In [5]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score
import random
import os

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Set seeds for reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if device.type == "cuda":
    torch.cuda.manual_seed_all(seed)

Using device: cpu


In [6]:
# CELL 3 — Load dataset

csv_path = "/content/drive/MyDrive/SmartSpend/smart-spend/data/transactions.csv"
df = pd.read_csv(csv_path)

print(df.head())
print("\nColumns:", df.columns.tolist())
print("\nClass distribution:")
print(df['category'].value_counts())


           merchant category
0  Starbucks Coffee     Food
1         McDonalds     Food
2   Subway Sandwich     Food
3         KFC Order     Food
4     Dominos Pizza     Food

Columns: ['merchant', 'category']

Class distribution:
category
Food             60
Shopping         60
Bills            60
Groceries        60
Entertainment    60
Travel           59
Fuel             59
Healthcare       59
Name: count, dtype: int64


In [7]:
# CELL 4 — Text cleaning & normalization

import re

def basic_clean(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z ]+', ' ', text)     # keep only letters + spaces
    text = re.sub(r'\s+', ' ', text)             # collapse multiple spaces
    return text.strip()

def normalize_abbreviations(text):
    replace_map = {
        "swgy": "swiggy",
        "zmt": "zomato",
        "brgr": "burger",
        "kng": "king",
        "domnos": "dominos",
        "pzza": "pizza",
        "amzn": "amazon",
        "prchs": "purchase",
        "fashn": "fashion",
        "nyka": "nykaa",
        "dmart": "d mart",
        "bazr": "bazaar",
        "hpcl": "hp",
        "iocl": "indian oil",
        "bpcl": "bp",
        "bunq": "bunk",
        "brdbnd": "broadband",
        "rchg": "recharge",
        "subscrn": "subscription",
        "mbrshp": "membership",
        "yt": "youtube",
        "gm": "game",
        "mdcn": "medicine",
    }
    for k, v in replace_map.items():
        text = re.sub(rf"\b{k}\b", v, text)
    return text

def normalize_brands(text):
    brand_map = {
        "ccd": "cafe coffee day",
        "k f c": "kfc",
        "kfc": "kfc",
        "wow momo": "wow momos",
        "big bazr": "big bazaar",
        "vistara flt": "vistara flight",
        "indigo": "indigo flight",
        "mmt": "make my trip",
        "pvr cinemaa": "pvr cinemas",
        "inox movi": "inox movies",
        "apollo phar": "apollo pharmacy",
        "medplus medical": "medplus",
        "1mg": "tata 1mg",
    }
    for k, v in brand_map.items():
        if k in text:
            text = text.replace(k, v)
    return text

def enrich_keywords(text):
    t = text
    if any(w in t for w in ["fuel", "petrol", "diesel", "bunk", "pump", "station"]):
        t += " fuel"
    if any(w in t for w in ["shop", "store", "mart", "bazaar", "bazr", "retail", "supermarket"]):
        t += " shopping"
    if any(w in t for w in ["hospital", "clinic", "pharmacy", "medical", "doctor", "lab", "test"]):
        t += " healthcare"
    if any(w in t for w in ["ticket", "flight", "train", "bus", "cab", "taxi", "autos", "metro", "toll"]):
        t += " travel"
    if any(w in t for w in ["subscription", "recharge", "bill", "emi", "fee", "charges"]):
        t += " bills"
    if any(w in t for w in ["pizza", "burger", "coffee", "food", "biryani", "snack", "momo", "sandwich"]):
        t += " food"
    return t

def full_clean(text):
    text = str(text)
    text = basic_clean(text)
    text = normalize_abbreviations(text)
    text = normalize_brands(text)
    text = enrich_keywords(text)
    text = basic_clean(text)   # final clean
    return text


In [8]:
# CELL 5 — Apply cleaning & encode labels

df['clean_merchant'] = df['merchant'].apply(full_clean)

print(df[['merchant', 'clean_merchant', 'category']].head(20))

label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])

num_labels = len(label_encoder.classes_)
print("\nLabel classes:", label_encoder.classes_)
print("Number of labels:", num_labels)


            merchant               clean_merchant category
0   Starbucks Coffee  starbucks coffee bills food     Food
1          McDonalds                    mcdonalds     Food
2    Subway Sandwich         subway sandwich food     Food
3          KFC Order                    kfc order     Food
4      Dominos Pizza           dominos pizza food     Food
5          Pizza Hut               pizza hut food     Food
6        Burger King             burger king food     Food
7    Cafe Coffee Day   cafe coffee day bills food     Food
8         Chai Point                   chai point     Food
9          Wow Momos              wow momoss food     Food
10         Taco Bell                    taco bell     Food
11     Dunkin Donuts                dunkin donuts     Food
12      Zomato Order                 zomato order     Food
13      Swiggy Order                 swiggy order     Food
14         FreshMenu                    freshmenu     Food
15       EatFit Meal                  eatfit meal     Fo

In [9]:
# CELL 6 — Train/validation split

X_train, X_val, y_train, y_val = train_test_split(
    df['clean_merchant'].values,
    df['label'].values,
    test_size=0.2,
    random_state=42,
    stratify=df['label'].values
)

print("Train size:", len(X_train))
print("Validation size:", len(X_val))


Train size: 381
Validation size: 96


In [10]:
# CELL 7 — Tokenizer & Dataset class

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

max_length = 32  # your texts are short; 32 is enough and fast

class TransactionsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )

        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

train_dataset = TransactionsDataset(X_train, y_train, tokenizer, max_length)
val_dataset   = TransactionsDataset(X_val,   y_val,   tokenizer, max_length)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_dataset,   batch_size=64, shuffle=False)

len(train_dataset), len(val_dataset)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

(381, 96)

In [11]:
# CELL 8 — Initialize DistilBERT classification model

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels
)

model.to(device)
print("Model loaded and moved to", device)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded and moved to cpu


In [12]:
# CELL 9 — Training loop

epochs = 3
optimizer = AdamW(model.parameters(), lr=2e-5)

total_steps = len(train_loader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

def train_epoch(model, data_loader, optimizer, scheduler, device):
    model.train()
    total_loss = 0

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    avg_loss = total_loss / len(data_loader)
    return avg_loss

def eval_epoch(model, data_loader, device):
    model.eval()
    total_loss = 0
    preds = []
    true_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            logits = outputs.logits
            batch_preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
            batch_labels = labels.detach().cpu().numpy()

            preds.extend(batch_preds)
            true_labels.extend(batch_labels)

    avg_loss = total_loss / len(data_loader)
    macro_f1 = f1_score(true_labels, preds, average='macro')

    return avg_loss, macro_f1, preds, true_labels

for epoch in range(epochs):
    print(f"\n===== Epoch {epoch+1}/{epochs} =====")
    train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
    val_loss, val_f1, _, _ = eval_epoch(model, val_loader, device)

    print(f"Train loss: {train_loss:.4f}")
    print(f"Val loss:   {val_loss:.4f}")
    print(f"Val Macro F1: {val_f1:.4f}")



===== Epoch 1/3 =====
Train loss: 2.0535
Val loss:   1.9779
Val Macro F1: 0.3338

===== Epoch 2/3 =====
Train loss: 1.9049
Val loss:   1.8263
Val Macro F1: 0.5757

===== Epoch 3/3 =====
Train loss: 1.7840
Val loss:   1.7642
Val Macro F1: 0.5574
