# Banking Data classifier

In [None]:
import os

# Turn off tokenizers parallelism through its environment flag; this is done
# purely to silence the warnings the library would otherwise print.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [22]:
import pandas as pd
import matplotlib.pyplot as plt

from datasets import load_dataset


# Fetch the PolyAI Banking77 dataset through `datasets.load_dataset`.
banking77 = load_dataset("PolyAI/banking77")

# Convert the "train" and "test" splits to pandas DataFrames; these raw frames
# are the inputs for every later cell.
df_raw_train, df_raw_test = (
    banking77[split_name].to_pandas() for split_name in ("train", "test")
)

In [24]:
# Pull the utterance column out of each raw split.
df_text_train = df_raw_train["text"]
df_text_test = df_raw_test["text"]

# Count the Python types present in each column *before* stripping, so that any
# non-string entry shows up in the report.
type_counts_train = df_text_train.apply(type).value_counts()
print("types in the train set: ", type_counts_train)

type_counts_test = df_text_test.apply(type).value_counts()
print("types in the test set: ", type_counts_test)

# Remove leading/trailing whitespace from every utterance.
df_text_train, df_text_test = df_text_train.str.strip(), df_text_test.str.strip()




types in the train set:  text
<class 'str'>    10003
Name: count, dtype: int64
types in the test set:  text
<class 'str'>    3080
Name: count, dtype: int64


Manual Data Cleaning

In [6]:
# Utterances must be strictly longer than this many characters to be kept.
MIN_TEXT_CHARS = 3


def clean_split(df_split: pd.DataFrame, min_chars: int = MIN_TEXT_CHARS) -> pd.DataFrame:
    """Return a cleaned copy of one dataset split.

    Steps, in order:
      1. Strip leading/trailing whitespace from the ``text`` column, so the
         length filter and the duplicate check both operate on the normalised
         text rather than on the raw strings.
      2. Keep only rows whose text is longer than ``min_chars`` characters.
      3. Drop rows whose text duplicates an earlier row (first occurrence wins).
      4. Reset the index so it is contiguous from 0.

    Args:
        df_split: Frame with at least a string ``text`` column.
        min_chars: Exclusive lower bound on the stripped text length.

    Returns:
        A new DataFrame; ``df_split`` itself is not modified.
    """
    df_stripped = df_split.assign(text=df_split["text"].str.strip())
    df_long_enough = df_stripped[df_stripped["text"].str.len() > min_chars]
    # A single reset at the end is enough: neither the filter nor
    # drop_duplicates depends on the index being contiguous.
    return df_long_enough.drop_duplicates(subset=["text"]).reset_index(drop=True)


df_raw_train_filtered = clean_split(df_raw_train)
df_raw_test_filtered = clean_split(df_raw_test)

# Invariants of the cleaning step: no duplicate utterances remain.
assert df_raw_train_filtered["text"].is_unique
assert df_raw_test_filtered["text"].is_unique

print(df_raw_train_filtered)
print(df_raw_test_filtered)


                                                    text  label
0                         I am still waiting on my card?     11
1      What can I do if my card still hasn't arrived ...     11
2      I have been waiting over a week. Is the card s...     11
3      Can I track my card while it is in the process...     11
4      How do I know if I will get my card, or if it ...     11
...                                                  ...    ...
9998              You provide support in what countries?     24
9999                  What countries are you supporting?     24
10000                What countries are getting support?     24
10001                     Are cards available in the EU?     24
10002                   Which countries are represented?     24

[10003 rows x 2 columns]
                                                   text  label
0                              How do I locate my card?     11
1     I still have not received my new card, I order...     11
2     I ordered a

Cleanlab


In [None]:
import joblib
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict


def get_initial_model_data(
    texts: list[str],
    labels: np.ndarray,
    model_name: str = "all-MiniLM-L6-v2",
    device: "str | None" = None,
    cv: int = 5,
) -> tuple[np.ndarray, np.ndarray]:
    """Embed texts and compute cross-validated class probabilities for them.

    The texts are encoded with a sentence-transformers model, then a
    logistic-regression classifier is evaluated with ``cross_val_predict`` so
    that each row's probabilities come from a model that was not fitted on
    that row.

    Args:
        texts: The utterances to embed. Any sequence of strings is accepted;
            it is converted to a list before encoding.
        labels: Class label for each text, same length and order as ``texts``.
        model_name: Name of the sentence-transformers model to load.
        device: Device string passed to ``SentenceTransformer`` (for example
            ``"cuda"`` or ``"cpu"``). ``None`` leaves the choice to the
            library instead of hard-coding a GPU, so the function also runs
            on machines without CUDA.
        cv: Number of cross-validation folds.

    Returns:
        ``(embeddings, pred_probs)``: the embedding matrix returned by
        ``model.encode`` and the probability matrix returned by
        ``cross_val_predict(..., method="predict_proba")``.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """
    texts = list(texts)
    labels = np.asarray(labels)
    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
        )

    # 1. Compute text embeddings with sentence-transformers
    #    (the default model produces 384 dimensions per text).
    model = SentenceTransformer(model_name, device=device)
    embeddings = model.encode(texts, show_progress_bar=True)

    # 2. Create the LogisticRegression object.
    # NOTE(review): recent scikit-learn releases appear to deprecate liblinear
    # for multiclass targets; confirm against the installed version.
    logistic_regression_model = LogisticRegression(random_state=0, C=0.1, solver='liblinear')

    # 3. Fit per fold and collect held-out probability predictions.
    pred_probs = cross_val_predict(
        logistic_regression_model,
        embeddings,
        labels,
        cv=cv,
        method="predict_proba",  # Important: returns probabilities, not 0/1 labels
        n_jobs=-1,
    )
    print("Cross-validation predictions computed.")

    return embeddings, pred_probs

In [None]:
# Use the cleaned training split produced by the "Manual Data Cleaning" cell.
# The original code read from `df`, which is never defined in this notebook and
# raises NameError on a fresh kernel.
texts = df_raw_train_filtered["text"].tolist()
labels = df_raw_train_filtered["label"].to_numpy()

embeddings, pred_probs = get_initial_model_data(texts, labels)

# Verification: one embedding and one probability row per input text, and one
# probability column per distinct label.
assert len(embeddings) == len(texts)
assert len(embeddings) == len(pred_probs)
assert pred_probs.ndim == 2
assert pred_probs.shape[1] == len(np.unique(labels))

print(f"✅ Embeddings shape: {embeddings.shape}")
print(f"✅ Pred_probs shape: {pred_probs.shape}")
print(f"✅ Verification passed!")