In [20]:
import transformers
print(transformers.__version__)


4.57.0


In [1]:
!pip install -q transformers datasets torch scikit-learn pandas



In [18]:
!pip install --upgrade torch transformers[torch] accelerate


Collecting torch
  Downloading torch-2.9.0-cp313-cp313-win_amd64.whl.metadata (30 kB)
Collecting accelerate
  Using cached accelerate-1.10.1-py3-none-any.whl.metadata (19 kB)
Downloading torch-2.9.0-cp313-cp313-win_amd64.whl (109.3 MB)
   ---------------------------------------- 0.0/109.3 MB ? eta -:--:--
   ---------------------------------------- 0.3/109.3 MB ? eta -:--:--
   ---------------------------------------- 0.8/109.3 MB 2.1 MB/s eta 0:00:53
   ---------------------------------------- 1.0/109.3 MB 2.4 MB/s eta 0:00:45
    --------------------------------------- 1.8/109.3 MB 2.4 MB/s eta 0:00:45
    --------------------------------------- 2.6/109.3 MB 2.8 MB/s eta 0:00:39
   - -------------------------------------- 3.4/109.3 MB 2.9 MB/s eta 0:00:37
   - -------------------------------------- 5.0/109.3 MB 3.6 MB/s eta 0:00:30
   -- ------------------------------------- 6.0/109.3 MB 3.8 MB/s eta 0:00:28
   -- ------------------------------------- 7.6/109.3 MB 4.2 MB/s eta 0:00:2

  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.8.0 requires torch==2.8.0, but you have torch 2.9.0 which is incompatible.
torchvision 0.23.0 requires torch==2.8.0, but you have torch 2.9.0 which is incompatible.


In [None]:
# ===============================================================
# 📘 STEP 1: Install and Import Dependencies
# ===============================================================
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [3]:
# ===============================================================
# 📘 STEP 2: Load Your CSV Dataset
# ===============================================================
# Replace with your file path, e.g., "/content/complaints.csv"
csv_path = "consumer_complaints_sampled.csv"
df = pd.read_csv(csv_path)

# Inspect the data
print(df.head())

# Check for nulls
print(df.isnull().sum())

# Drop rows with missing text or labels
df = df.dropna(subset=["Consumer complaint narrative", "Label"])

# Convert labels to int if needed
df["Label"] = df["Label"].astype(int)

                        Consumer complaint narrative  \
0  In XXXX of 2010 I purchased a Toyota. I did no...   
1  On XX/XX/XXXX I called Concord and I spoke wit...   
2  My mortgage servicer is Nationstar DBA Mr. Coo...   
3  Back in XXXX I had a lawyer file with the cour...   
4  I have received letters stating that they have...   

                                             Product  Label  
0                                      Consumer Loan      2  
1                                    Debt collection      1  
2                                           Mortgage      3  
3                                    Debt collection      1  
4  Credit reporting, credit repair services, or o...      0  
Consumer complaint narrative    0
Product                         0
Label                           0
dtype: int64


In [4]:
# ===============================================================
# 📘 STEP 3: Train-Test Split
# ===============================================================
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["Label"])

# Convert to HuggingFace Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [5]:
# ===============================================================
# 📘 STEP 4: Tokenization
# ===============================================================
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

def tokenize_function(example):
    return tokenizer(
        example["Consumer complaint narrative"],
        truncation=True,
        padding=False,  # handled later by DataCollator
        max_length=256
    )

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map: 100%|█████████████████████████████████████████████████████████████| 160000/160000 [12:07<00:00, 219.80 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████| 40000/40000 [03:04<00:00, 217.30 examples/s]


In [6]:
# ===============================================================
# 📘 STEP 5: Data Collator
# ===============================================================
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [7]:
# ===============================================================
# 📘 STEP 6: Load Model
# ===============================================================
num_labels = df["Label"].nunique()
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# ===============================================================
# 📘 STEP 7: Metrics Function
# ===============================================================
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [None]:

training_args = TrainingArguments(
    output_dir="./bert_finetuned_product_classification",
    eval_strategy="epoch",  # Changed from evaluation_strategy
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [21]:
# ===============================================================
# 📘 STEP 9: Trainer Setup
# ===============================================================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



NameError: name 'training_args' is not defined

In [None]:
# ===============================================================
# 📘 STEP 10: Train the Model
# ===============================================================
trainer.train()


In [None]:
# ===============================================================
# 📘 STEP 11: Evaluate
# ===============================================================
metrics = trainer.evaluate()
print(metrics)


In [None]:
# ===============================================================
# 📘 STEP 12: Save Model
# ===============================================================
trainer.save_model("./bert_finetuned_product_classifier")
tokenizer.save_pretrained("./bert_finetuned_product_classifier")


In [None]:
# ===============================================================
# 📘 STEP 13: Test on New Example
# ===============================================================
sample_text = "I was wrongly charged by the debt collector for a loan I never had."
inputs = tokenizer(sample_text, return_tensors="pt", truncation=True, padding=True, max_length=256)
outputs = model(**inputs)
pred = torch.argmax(outputs.logits, dim=1).item()

print(f"Predicted Label: {pred}")
