# Banking Data classifier

In [2]:
import os
# Gets rid of irritating warnings by switching tokenizer parallelism off.
# NOTE(review): presumably this has to be set before any tokenizer-backed
# library is imported, which would explain why it sits in the first cell — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
import pandas as pd
import matplotlib.pyplot as plt

from datasets import load_dataset


# Load the PolyAI/banking77 dataset through `datasets.load_dataset`.
banking77 = load_dataset("PolyAI/banking77")

# Convert both splits to pandas frames; these raw frames are what the
# cleaning cells below start from.
df_raw_train, df_raw_test = (
    banking77[split].to_pandas() for split in ("train", "test")
)

In [4]:
# Pull the utterance column out of each raw split.
df_text_train = df_raw_train["text"]
df_text_test = df_raw_test["text"]

# Sanity check: tally the Python type of every entry, split by split.
for split_name, split_texts in (("train", df_text_train), ("test", df_text_test)):
    type_counts = split_texts.apply(type).value_counts()
    print(f"types in the {split_name} set: ", type_counts)

# Trim surrounding whitespace. The result lives only in these standalone
# Series; this cell does not write it back into df_raw_train / df_raw_test.
df_text_train, df_text_test = df_text_train.str.strip(), df_text_test.str.strip()




types in the train set:  text
<class 'str'>    10003
Name: count, dtype: int64
types in the test set:  text
<class 'str'>    3080
Name: count, dtype: int64


Manual Data Cleaning

In [5]:
def _drop_short_and_repeated_texts(df_split):
    """Keep rows whose text is longer than 3 characters, then drop exact repeats.

    The length check runs on the frame's own "text" column as-is. The first
    occurrence of each distinct text is kept and the result is re-indexed
    from 0.
    """
    long_enough = df_split["text"].str.len() > 3
    return (
        df_split[long_enough]
        .drop_duplicates(subset=["text"])
        .reset_index(drop=True)
    )


# Same cleaning pipeline for both splits.
df_raw_train_filtered = _drop_short_and_repeated_texts(df_raw_train)
df_raw_test_filtered = _drop_short_and_repeated_texts(df_raw_test)

print(df_raw_train_filtered)
print(df_raw_test_filtered)


                                                    text  label
0                         I am still waiting on my card?     11
1      What can I do if my card still hasn't arrived ...     11
2      I have been waiting over a week. Is the card s...     11
3      Can I track my card while it is in the process...     11
4      How do I know if I will get my card, or if it ...     11
...                                                  ...    ...
9998              You provide support in what countries?     24
9999                  What countries are you supporting?     24
10000                What countries are getting support?     24
10001                     Are cards available in the EU?     24
10002                   Which countries are represented?     24

[10003 rows x 2 columns]
                                                   text  label
0                              How do I locate my card?     11
1     I still have not received my new card, I order...     11
2     I ordered a

In [12]:
# `plt` must stay bound to pyplot: `import matplotlib as plt` would rebind it
# to the top-level package, which has no plotting functions such as `plt.plot`.
import matplotlib.pyplot as plt

# Number of training examples per intent label ...
counts = df_raw_train_filtered["label"].value_counts()

# ... ordered from the most to the least frequent label.
counts = counts.sort_values(ascending=False)

# One "label_id: n_examples" line per label.
for label_id, n_examples in counts.items():
    print(f"{label_id}: {n_examples}")

15: 187
28: 182
6: 181
75: 180
19: 177
63: 175
26: 173
64: 172
5: 171
66: 171
52: 169
16: 168
17: 167
34: 166
76: 163
51: 162
53: 161
20: 160
0: 159
45: 159
8: 157
7: 156
25: 153
11: 153
47: 149
48: 148
61: 146
59: 145
46: 143
13: 139
35: 137
73: 135
27: 133
9: 129
39: 129
54: 129
24: 129
67: 128
4: 127
2: 126
36: 126
71: 126
21: 122
29: 121
74: 121
31: 121
42: 121
30: 121
43: 120
33: 118
49: 115
57: 114
58: 114
65: 113
70: 113
12: 112
32: 112
14: 112
56: 111
1: 110
55: 108
38: 106
44: 105
69: 104
62: 103
68: 102
40: 98
60: 97
37: 97
50: 95
3: 87
22: 86
41: 82
18: 61
10: 59
72: 41
23: 35


Cleanlab


In [28]:
from sklearn.utils import resample


# Stratified shuffle of the cleaned training set: sampling without replacement
# with n_samples equal to the frame length keeps every row.
# The size is read from the frame instead of being hard-coded, so the cell
# keeps working when the cleaning steps above change the number of rows
# (a fixed 10003 raises ValueError as soon as fewer rows are available).
df = resample(
    df_raw_train_filtered,
    replace=False,
    n_samples=len(df_raw_train_filtered),
    random_state=0,
    stratify=df_raw_train_filtered["label"],
)
# Row labels become 0..n-1; later cells index `texts`/`labels` by position.
df = df.reset_index(drop=True)
df

Unnamed: 0,text,label
0,I was checking the app for my account and I no...,28
1,Why would the ATM machine fail to give me the ...,26
2,Do you take both Visa and Mastercard?,73
3,I see cash in my app but I did not get it.,20
4,I want to activate my new card.,0
...,...,...
9998,"Hae, I already completed my 3D secure authenti...",61
9999,Do you accept Visa or Mastercard?,73
10000,At what age can my children use your service?,1
10001,"I ordered something, and now I have buyer's re...",52


In [31]:
import joblib
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict


def get_initial_model_data(
    texts: list[str],
    labels: np.ndarray,
    *,
    model_name: str = "all-MiniLM-L6-v2",
    device: "str | None" = None,
    cv: int = 5,
) -> tuple[np.ndarray, np.ndarray]:
    """Compute embeddings and out-of-sample class probabilities for Cleanlab.

    Args:
        texts: Input sentences, one per example.
        labels: Class label of each text, aligned with ``texts``.
        model_name: sentence-transformers model used for the embeddings.
        device: Device passed to ``SentenceTransformer`` (e.g. ``"cuda"`` or
            ``"cpu"``). ``None`` leaves the choice to the library, so the
            function also runs on machines without a GPU.
        cv: Number of cross-validation folds used for the predictions.

    Returns:
        ``(embeddings, pred_probs)``: one embedding row per text (384
        dimensions with the default model) and one row of cross-validated
        class probabilities per text.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """
    # Fail fast, before the expensive encoding step, on misaligned inputs.
    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
        )

    # 1. Compute text embeddings with sentence-transformers
    model = SentenceTransformer(model_name, device=device)
    embeddings = model.encode(texts, show_progress_bar=True)

    # 2. Create LogisticRegression model (multinomial is the default)
    logistic_regression_model = LogisticRegression(
        random_state=0,
        C=1.0,
        solver="lbfgs",
        class_weight="balanced",
        max_iter=1000,
        n_jobs=-1,
    )

    # 3. Compute out-of-sample predicted probabilities using cross_val_predict:
    #    every row is scored by a model that did not see it during fitting.
    pred_probs = cross_val_predict(
        logistic_regression_model,
        embeddings,
        labels,
        cv=cv,
        method="predict_proba",  # returns probabilities, not hard labels
        n_jobs=-1,
    )
    print("Cross-validation predictions computed.")

    return embeddings, pred_probs

In [32]:
# Materialise the model inputs from the shuffled training frame.
texts = df["text"].to_list()
labels = df["label"].to_numpy(dtype=int)

embeddings, pred_probs = get_initial_model_data(texts, labels)

# Verification: one probability row per embedding, laid out as a 2-D matrix.
assert len(pred_probs) == len(embeddings)
assert pred_probs.ndim == 2

for array_title, array in (("Embeddings", embeddings), ("Pred_probs", pred_probs)):
    print(f"✅ {array_title} shape: {array.shape}")
print("✅ Verification passed!")

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Cross-validation predictions computed.
✅ Embeddings shape: (10003, 384)
✅ Pred_probs shape: (10003, 77)
✅ Verification passed!


In [33]:
from cleanlab import Datalab


# Bundle the texts with their given labels; "labels" is the column Datalab
# is told to treat as the label.
data_dict = dict(texts=texts, labels=labels)
lab = Datalab(data_dict, task="classification", label_name="labels")

# Audit with both the out-of-sample probabilities and the embeddings,
# then print the summary report.
lab.find_issues(features=embeddings, pred_probs=pred_probs)
lab.report()


Finding null issues ...
Finding label issues ...
Finding outlier issues ...
Finding near_duplicate issues ...
Finding non_iid issues ...
Finding class_imbalance issues ...
Finding underperforming_group issues ...

Audit complete. 643 issues found in the dataset.
Dataset Information: num_examples: 10003, num_classes: 77

Here is a summary of various issues found in your data:

    issue_type  num_issues
near_duplicate         350
         label         162
       outlier         131

Learn about each issue: https://docs.cleanlab.ai/stable/cleanlab/datalab/guide/issue_type_description.html
See which examples in your dataset exhibit each issue via: `datalab.get_issues(<ISSUE_NAME>)`

Data indices corresponding to top examples of each issue are shown below.


------------------ near_duplicate issues -------------------

About this issue:
	A (near) duplicate issue refers to two or more examples in
    a dataset that are extremely similar to each other, relative
    to the rest of the datase

In [40]:
# Rows flagged as near duplicates, ordered by ascending near_duplicate_score.
duplicate_issues = (
    lab.get_issues("near_duplicate")
    .loc[lambda issues: issues["is_near_duplicate_issue"]]
    .sort_values(by="near_duplicate_score")
)

In [48]:
# First 10 flagged rows (lowest near_duplicate_score): print each text with up
# to three of the texts cleanlab grouped with it.
for flagged_idx, dup_set in duplicate_issues.head(10)["near_duplicate_sets"].items():
    dup_texts = [texts[j] for j in dup_set[:3]]
    print(f"Text: {texts[flagged_idx]}")
    print(f"\tDuplicates: {dup_texts}")
    print()

Text: 
I put the wrong pin too many times and now it is blocked. Can you help me unblock it?
	Duplicates: ['I put the wrong pin too many times and now it is blocked. Can you help me unblock it?']

Text: Where can I withdraw money from?
	Duplicates: ['\nWhere can I withdraw money from?']

Text: Why do you keep declining my payment?I tried several times already with this card and it is just not working.
	Duplicates: ['Why do you keep declining my payment? I tried several times already with this card and it is just not working.', 'Why do you keep declining my payment?I tried several times with this card and it is just not working.']

Text: Why do you keep declining my payment? I tried several times already with this card and it is just not working.
	Duplicates: ['Why do you keep declining my payment?I tried several times already with this card and it is just not working.', 'Why do you keep declining my payment?I tried several times with this card and it is just not working.']

Text: How c

In [49]:
# Last 10 flagged rows (highest near_duplicate_score among the flagged ones):
# print each text with up to three of the texts cleanlab grouped with it.
last_flagged = duplicate_issues.tail(10)
for flagged_idx, dup_set in zip(last_flagged.index, last_flagged["near_duplicate_sets"]):
    print(f"Text: {texts[flagged_idx]}")
    print(f"\tDuplicates: {[texts[j] for j in dup_set[:3]]}")
    print()

Text: What should I do if I can't prove my identity.
	Duplicates: ["What do I do if I can't prove my identity?"]

Text: How can I receive a virtual card?
	Duplicates: ['How do I receive a virtual card?']

Text: How do I receive a virtual card?
	Duplicates: ['How can I receive a virtual card?']

Text: Can you please explain why my transfer failed?
	Duplicates: ['Can you please tell me why my transfer failed?']

Text: Can you please tell me why my transfer failed?
	Duplicates: ['Can you please explain why my transfer failed?']

Text: Is there a limit to a disposable virtual card?
	Duplicates: ['Is there a limit to using a disposable virtual card?']

Text: Is there a limit to using a disposable virtual card?
	Duplicates: ['Is there a limit to a disposable virtual card?']

Text: Can I top up any amount?
	Duplicates: ['Can I top-up any amount?']

Text: Can I top-up any amount?
	Duplicates: ['Can I top up any amount?']

Text: How long will a transfer from the US take?
	Duplicates: ['How long

In [43]:
# Case-insensitive de-duplication: keep the first occurrence of every
# lower-cased text. The comparison key is computed on the fly so that no
# helper column is left behind on `df`, the frame that `texts`/`labels`
# (and therefore the cleanlab issue indices) were built from.
is_case_insensitive_repeat = df["text"].str.lower().duplicated()

# NOTE: the result is re-indexed from 0, so its row labels no longer match
# row positions in `df` once any row has been dropped.
df_deduplicated = df.loc[~is_case_insensitive_repeat].reset_index(drop=True)
df_deduplicated

Unnamed: 0,text,label
0,I was checking the app for my account and I no...,28
1,Why would the ATM machine fail to give me the ...,26
2,Do you take both Visa and Mastercard?,73
3,I see cash in my app but I did not get it.,20
4,I want to activate my new card.,0
...,...,...
9998,"Hae, I already completed my 3D secure authenti...",61
9999,Do you accept Visa or Mastercard?,73
10000,At what age can my children use your service?,1
10001,"I ordered something, and now I have buyer's re...",52


Label analysis

In [50]:
# Rows cleanlab flags as label issues, sorted by ascending label_score.
label_issues = (
    lab.get_issues("label")
    .loc[lambda issues: issues["is_label_issue"]]
    .sort_values(by="label_score")
)

# Inspect top 10 most likely label issues
top_label_issues = label_issues.head(10)
top_label_issues_idxs = top_label_issues.index.tolist()
top_label_issues_y_true = top_label_issues["given_label"]
top_label_issues_y_pred = top_label_issues["predicted_label"]
# texts is a plain Python list, so look every index up individually
top_label_issues_texts = list(map(texts.__getitem__, top_label_issues_idxs))

# Label id -> human-readable intent name, taken from the banking77 features
id2label = dict(enumerate(banking77["train"].features["label"].names))

print("Top 10 label issues")
for text, y_true, y_pred in zip(top_label_issues_texts, top_label_issues_y_true, top_label_issues_y_pred):
    given_name, predicted_name = id2label[int(y_true)], id2label[int(y_pred)]
    print(f"given_label {y_true} ({given_name}), predicted_label {y_pred} ({predicted_name}), text: {text}")
    print()

Top 10 label issues
given_label 62 (topping_up_by_card), predicted_label 75 (wrong_amount_of_cash_received), text: what happened to the money after i put in the wrong info and it got declined

given_label 48 (pending_transfer), predicted_label 5 (balance_not_updated_after_bank_transfer), text: I have not seen any changes in my account balance(s) after making a transfer.

given_label 26 (declined_cash_withdrawal), predicted_label 49 (pin_blocked), text: my card was frozen due to putting in the wrong pin too much.  how many tries do i have

given_label 65 (transfer_into_account), predicted_label 62 (topping_up_by_card), text: How do I top up?

given_label 59 (top_up_failed), predicted_label 25 (declined_card_payment), text: My credit card transaction was declined for a loan payment.  Can you tell me why?

given_label 41 (lost_or_stolen_card), predicted_label 11 (card_arrival), text: Can you tell me the status of my new card?

given_label 67 (transfer_timing), predicted_label 5 (balance_n

In [51]:
# Cleanlab's issue table is indexed by row position in `df` (the frame that
# `texts`/`labels` were built from), whereas `df_deduplicated` was re-indexed
# after the case-insensitive duplicates were dropped. Row ids therefore have
# to be translated before any label is overwritten; using them directly would
# relabel the wrong rows as soon as a single duplicate has been removed.
kept_mask = ~df["text"].str.lower().duplicated()
kept_original_idxs = df.index[kept_mask]

# Guard: the reconstruction above must describe exactly the rows in df_deduplicated.
assert df_deduplicated["text"].tolist() == df.loc[kept_mask, "text"].tolist(), (
    "df_deduplicated is not the case-insensitive de-duplication of df"
)

# original row id in `df` -> row label of the same example in `df_deduplicated`
original_to_dedup = pd.Series(df_deduplicated.index, index=kept_original_idxs)

# Label issues on rows that were removed during deduplication have nothing to fix.
label_issues = label_issues[label_issues.index.isin(kept_original_idxs)]

idxs = original_to_dedup.loc[label_issues.index].tolist()
# Plain array on purpose: the values are assigned positionally to `idxs`
# instead of being re-aligned on the (original) issue index.
pred_labels = label_issues["predicted_label"].to_numpy()

df_fixed = df_deduplicated.copy()
df_fixed.loc[idxs, "label"] = pred_labels
df_fixed

Unnamed: 0,text,label
0,I was checking the app for my account and I no...,28
1,Why would the ATM machine fail to give me the ...,26
2,Do you take both Visa and Mastercard?,73
3,I see cash in my app but I did not get it.,20
4,I want to activate my new card.,0
...,...,...
9998,"Hae, I already completed my 3D secure authenti...",61
9999,Do you accept Visa or Mastercard?,73
10000,At what age can my children use your service?,1
10001,"I ordered something, and now I have buyer's re...",52


Outlier analysis

In [52]:

# Rows flagged as outliers, sorted by ascending outlier_score.
outlier_issues = (
    lab.get_issues("outlier")
    .loc[lambda issues: issues["is_outlier_issue"]]
    .sort_values(by="outlier_score")
)

# Label id -> human-readable intent name, taken from the banking77 features
id2label = dict(enumerate(banking77["train"].features["label"].names))

print("Outliers with strongest evidence (lowest outlier_score):")
# Every flagged row is printed, in the sorted order established above.
for idx, score in outlier_issues["outlier_score"].items():
    label_id = int(labels[idx])
    label_name = id2label[label_id]
    text = texts[idx]

    print(f"\n--- Outlier (Score: {score:.4f}) ---")
    print(f"Label: {label_name} (id={label_id})")
    print(f"Text: \"{text}\"")
    print("-" * 60)


Outliers with strongest evidence (lowest outlier_score):

--- Outlier (Score: 0.0051) ---
Label: card_acceptance (id=10)
Text: "WHAT IS THE ATMOSPHERE OF IT"
------------------------------------------------------------

--- Outlier (Score: 0.0063) ---
Label: direct_debit_payment_not_recognised (id=28)
Text: "what is the word?"
------------------------------------------------------------

--- Outlier (Score: 0.0079) ---
Label: cash_withdrawal_not_recognised (id=20)
Text: "What is this witdrawal"
------------------------------------------------------------

--- Outlier (Score: 0.0079) ---
Label: direct_debit_payment_not_recognised (id=28)
Text: "what is the matter?"
------------------------------------------------------------

--- Outlier (Score: 0.0087) ---
Label: card_arrival (id=11)
Text: "WHAT IS THE SOLUTION OF THIS PROBLEM"
------------------------------------------------------------

--- Outlier (Score: 0.0107) ---
Label: visa_or_mastercard (id=73)
Text: "I prefer Mastecard."
----