In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import spacy
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import(
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding
)
import torch
import gc
from torch.utils.data import Dataset

gc.collect()
torch.cuda.empty_cache() if torch.cuda.is_available() else None

Training Data

In [2]:
df = pd.read_excel('Pralov.xls', skiprows=8)
df = df.dropna()

print(df)

    Reference Code              Date Time  \
0          0WPVCY9    2025-05-28 14:32:57   
1          0XRKVN9  2025-05-27 21:59:03.0   
2          0XRHORM  2025-05-27 20:51:19.0   
3          0XR8ZO5  2025-05-27 19:10:16.0   
4          0XQDXTQ  2025-05-27 11:20:44.0   
..             ...                    ...   
133        0VCPH57  2025-04-03 16:04:08.0   
134        0VCPDU2  2025-04-03 16:02:41.0   
135        0VC723Q  2025-04-03 10:56:21.0   
136        0VC723Q  2025-04-03 10:56:21.0   
137        0V9K8PE  2025-04-01 15:58:06.0   

                                   Description    Dr.    Cr.    Status  \
0        Paid for Dipesh Hair Cutting & Saloon  150.0    0.0  COMPLETE   
1             Fund Transferred by Biplov Malla    0.0  200.0  COMPLETE   
2      Fund Transferred to Bel Prasad Shrestha  400.0    0.0  COMPLETE   
3                  Paid for NANDINI FOOD COURT  370.0    0.0  COMPLETE   
4                     Paid for Shahi Suppliers   80.0    0.0  COMPLETE   
..             

## Basic Text Preprocessing

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\swoye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\swoye\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\swoye\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\swoye\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\swoye\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
from nltk.tokenize import word_tokenize
stopwords = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
nlp = spacy.load("en_core_web_sm")

In [5]:
def textPreprocessing(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()  
    text = word_tokenize(text)
    return text

def stopwordRemoval(text):
    text = [word for word in text if word not in stopwords]
    return text

def lemmatization(text):
    text = [lemmatizer.lemmatize(word) for word in text]
    return text 

def posTagging(text):
    text = nltk.pos_tag(text)
    text = [word for word, tag in text if tag.startswith('N') or tag.startswith('V')]   
    return text

def cleanTextPipeline(text):
    text = textPreprocessing(text)
    text = stopwordRemoval(text)
    text = lemmatization(text)
    text = posTagging(text)
    text = ' '.join(text)
    return text

df['processedDescription'] = df["Description"].apply(lambda x : cleanTextPipeline(x))

print(df[['Description', 'processedDescription']])


                                   Description  \
0        Paid for Dipesh Hair Cutting & Saloon   
1             Fund Transferred by Biplov Malla   
2      Fund Transferred to Bel Prasad Shrestha   
3                  Paid for NANDINI FOOD COURT   
4                     Paid for Shahi Suppliers   
..                                         ...   
133  Fund Transferred to Ashraya Jung Sijapati   
134                   Paid for HAMRO ADDA CAFE   
135                      Bank transfer charges   
136      Money transferred to SANIMA BANK LTD.   
137          Paid for Gopi    Krishna    Store   

                 processedDescription  
0            paid hair cutting saloon  
1              fund transferred malla  
2    fund transferred prasad shrestha  
3                     paid food court  
4                 paid shahi supplier  
..                                ...  
133    fund transferred jung sijapati  
134              paid hamro adda cafe  
135              bank transfer charge  

Spacy Usage

In [6]:
def cleanTextPipelineSpacy(text):
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()

    doc = nlp(text) 

    tokens = [token for token in doc if not token.is_stop and not token.is_punct]

    lemmas = [token.lemma_ for token in tokens]

    filtered_tokens = [lemma for lemma, token in zip(lemmas, tokens) if token.pos_ in ['NOUN', 'VERB']]

    return ' '.join(filtered_tokens)

new_try = df['Description'].apply(lambda x: cleanTextPipelineSpacy(x))

print(new_try)

0             pay dipesh hair cut saloon
1                          fund transfer
2      fund transfer bel prasad shrestha
3                         pay food court
4                           pay supplier
                     ...                
133                        fund transfer
134                                  pay
135                 bank transfer charge
136                       money transfer
137                       pay gopi store
Name: Description, Length: 138, dtype: object


In [7]:
print(df.columns)

Index(['Reference Code', 'Date Time', 'Description', 'Dr.', 'Cr.', 'Status',
       'Balance (NPR)', 'Channel', 'Category', 'processedDescription'],
      dtype='object')


## Model Testing

In [8]:
class TransactionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Convert texts and labels to lists to ensure indexing works properly
        self.texts = texts.reset_index(drop=True)
        self.labels = labels.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        # Verify data integrity
        assert len(self.texts) == len(self.labels), "Texts and labels must have the same length"
        print(f"Dataset created with {len(self.texts)} examples")

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        if idx >= len(self.texts):
            raise IndexError(f"Index {idx} out of bounds for dataset of size {len(self.texts)}")
        
        try:
            text = str(self.texts.iloc[idx])
            label = self.labels.iloc[idx]
            
            encoding = self.tokenizer(
                text,
                truncation=True,
                padding='max_length',
                max_length=self.max_length,
                return_tensors='pt'
            )
        
            return {
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),    
                'labels': torch.tensor(label, dtype=torch.long)
            }
        except Exception as e:
            print(f"Error processing item at index {idx}")
            print(f"Text: {self.texts.iloc[idx] if idx < len(self.texts) else 'Index out of bounds'}")
            print(f"Label: {self.labels.iloc[idx] if idx < len(self.labels) else 'Index out of bounds'}")
            raise e

Prepare transaction data

In [9]:
def prepare_bert_data(df):
    df['combined_text'] = df['processedDescription'] + ' ' + df['Dr.'].astype(str) + ' ' + df['Cr.'].astype(str) + ' ' + df['Balance (NPR)'].astype(str) + ' ' + df['Channel'] + ' ' + df['Status']
    unique_labels = df['Category'].unique()
    label_to_id = {label: idx for idx, label in enumerate(unique_labels)}
    id_to_label = {idx: label for label, idx in label_to_id.items()}

    df['label_id'] = df['Category'].map(label_to_id)

    print(f"Found {len(unique_labels)} unique labels.")
    print("Label mappings:")
    for label, idx in label_to_id.items():
        print(f" {idx}: {label}")
    
    print("\nLabel ID range in dataset:")
    print(f"Min: {df['label_id'].min()}, Max: {df['label_id'].max()}")
    
    return df, label_to_id, id_to_label

In [10]:
def train_bert_model(df, model_name = "distilbert-base-uncased", test_size = 0.2, epochs = 3, batch_size = 16):
    df, label_to_id, id_to_label = prepare_bert_data(df)
    
    X = df['combined_text']
    Y = df['label_id']
    
    # Print dataset statistics
    print(f"Total dataset size: {len(df)}")
    print(f"Number of unique labels: {len(label_to_id)}")
    print("Label distribution:")
    for label, count in df['Category'].value_counts().items():
        print(f" - {label}: {count}")
    
    train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=test_size, random_state=42)   
    
    print(f"\nTrain set size: {len(train_X)}")
    print(f"Test set size: {len(test_X)}")
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_to_id))
    
    train_dataset = TransactionDataset(train_X, train_Y, tokenizer)
    test_dataset = TransactionDataset(test_X, test_Y, tokenizer)
    
    # Verify datasets
    print("\nVerifying datasets:")
    print(f"Train dataset size: {len(train_dataset)}")
    print(f"Test dataset size: {len(test_dataset)}")
    
    training_args = TrainingArguments(
        output_dir='./transaction_classifier',
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        eval_steps=50,               
        do_eval=True,                 
        save_steps=50,                
        save_total_limit=2,  
        dataloader_num_workers=0,
        dataloader_pin_memory=True,          
    )
    
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        data_collator=data_collator,
    )

    print(f"CUDA available: {torch.cuda.is_available()}")
    print(f"Current device: {torch.cuda.current_device() if torch.cuda.is_available() else 'CPU'}")

    print(f"Number of labels in mapping: {len(label_to_id)}")
    print(f"Model output dimension: {model.config.num_labels}")
    
    print("Starting the BERT model training...")
    print(f"This will take approximately {epochs * len(train_dataset) // batch_size} minutes")
    trainer.train()
    
    print("Evaluating the model...")
    predictions = trainer.predict(test_dataset)
    pred_labels = np.argmax(predictions.predictions, axis=1)
    
    accuracy = accuracy_score(test_Y, pred_labels)
    
    print(f"\nValidation Accuracy: {accuracy:.4f}")
    print("\nDetailed Classification Report:")
    print(classification_report(
        test_Y.values, pred_labels, target_names=[id_to_label[i] for i in range(len(id_to_label))]
    ))
    
    return trainer, tokenizer, (label_to_id, id_to_label)

In [11]:
def predict_with_bert(trainer, tokenizer, label_mappings, texts, return_probabilities = False):
    label_to_id, id_to_label = label_mappings
    model = trainer.model
    num_labels = len(id_to_label)

    print(f"Making predictions for {len(texts)} transactions...")

    encodings = tokenizer(
        texts, 
        truncation=True, 
        padding=True, 
        return_tensors='pt', 
        max_length=128  
    )

    model.eval()
    with torch.no_grad():
        outputs = model(**encodings)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

    pred_labels = torch.argmax(predictions, dim=-1).numpy()
    
    # Add safety check for label indices
    final_predictions = []
    for label in pred_labels:
        if label >= num_labels:
            print(f"Warning: Invalid label index {label}, defaulting to 0")
            final_predictions.append(id_to_label[0])
        else:
            final_predictions.append(id_to_label[label])

    if return_probabilities:
        return final_predictions, predictions.numpy()
    
    return final_predictions
    

In [12]:
def bert_usage():
    print("=" * 60)
    print("Transaction Classification with BERT")
    print("=" * 60)

    trainer, tokenizer, label_mappings = train_bert_model(df, epochs = 2, batch_size = 4)   

    new_transactions = [
        "Paid for medical store",
        "Uber ride to airport", 
        "Netflix subscription payment",
        "Grocery shopping at supermarket"
    ]
    
    print("\n" + "=" * 40)
    print("TESTING PREDICTIONS")
    print("=" * 40)
    
    predictions = predict_with_bert(trainer, tokenizer, label_mappings, new_transactions)
    
    for transaction, prediction in zip(new_transactions, predictions):
        print(f"Transaction: '{transaction}'")
        print(f"Predicted Category: {prediction}")
        print("-" * 40)

In [13]:
# bert_usage()

In [14]:
import transformers
print(transformers.__version__)

4.52.4
