In [1]:
# ---------------------------
# Cell 1: Imports & Load Data
# ---------------------------

import pandas as pd
import numpy as np
import re
import random
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

# Load your dataset (update path if needed)
df = pd.read_csv("/content/drive/MyDrive/SmartSpend/smart-spend/data/transactions.csv")  # must contain: merchant, category
df.head()


Unnamed: 0,merchant,category
0,Starbucks Coffee,Food
1,McDonalds,Food
2,Subway Sandwich,Food
3,KFC Order,Food
4,Dominos Pizza,Food


In [2]:
# ---------------------------------
# Cell 2: Preprocessing Functions
# ---------------------------------

BRAND_NORMALIZATION = {
    "swgy": "swiggy",
    "swgy ordr": "swiggy order",
    "zomato": "zomato",
    "mcd": "mcdonalds",
    "brgr kng": "burger king",
    "k f c": "kfc",
    "domnos": "dominos",
    "dmart": "d mart",
    "big bazr": "big bazaar",
    "hpcl": "hindustan petroleum",
    "iocl": "indian oil",
    "bpcl": "bharat petroleum",
    "jio bp": "jiobp",
    "amzn": "amazon",
    "flipkart ord": "flipkart order",
    "ps store": "playstation store",
    "yt premium": "youtube premium",
    "1mg": "tata 1mg",
    "medplus": "medplus pharmacy",
}

HEALTHCARE_EXPANSION = {
    "dentl": "dental",
    "clnc": "clinic",
    "sessn": "session",
    "physio": "physiotherapy",
    "meds": "medicines",
    "dlry": "delivery",
    "opd": "outpatient department",
    "cbc": "complete blood count",
}

BILL_NORMALIZATION = {
    "elec": "electricity",
    "tneb": "electricity board",
    "ctw": "water board",
    "cc": "credit card",
    "subscrn": "subscription",
    "billfee": "bill fee",
}

def apply_dictionary(text, dictionary):
    for k, v in dictionary.items():
        if k in text:
            text = text.replace(k, v)
    return text

def clean_text(text):

    text = text.lower()

    # Apply normalizations
    text = apply_dictionary(text, BRAND_NORMALIZATION)
    text = apply_dictionary(text, HEALTHCARE_EXPANSION)
    text = apply_dictionary(text, BILL_NORMALIZATION)

    # Remove numbers, symbols
    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Cleanup spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def preprocess_dataframe(df):
    df = df.copy()
    df["merchant"] = df["merchant"].astype(str).apply(clean_text)
    return df

df_clean = preprocess_dataframe(df)
df_clean.head()


Unnamed: 0,merchant,category
0,starbucks coffee,Food
1,mcdonaldsonalds,Food
2,subway sandwich,Food
3,kfc order,Food
4,dominos pizza,Food


In [3]:
# ---------------------------------------
# Cell 3: Synthetic Noise Augmentation
# ---------------------------------------

def add_noise(text):
    options = []

    # Remove a random character
    if len(text) > 4:
        idx = random.randint(0, len(text)-1)
        options.append(text[:idx] + text[idx+1:])

    # Swap two characters
    if len(text) > 5:
        i = random.randint(0, len(text)-2)
        s = list(text)
        s[i], s[i+1] = s[i+1], s[i]
        options.append("".join(s))

    # Extra spaces
    options.append(text.replace(" ", "  "))

    # Repeat a word
    w = text.split()
    if len(w) > 1:
        options.append(text + " " + random.choice(w))

    return random.choice(options)

augmented_rows = []
for i, row in df_clean.iterrows():
    if random.random() < 0.30:  # Augment 30%
        augmented_rows.append([add_noise(row["merchant"]), row["category"]])

df_aug = pd.DataFrame(augmented_rows, columns=["merchant", "category"])
df_final = pd.concat([df_clean, df_aug]).reset_index(drop=True)

print("Original data:", df_clean.shape)
print("After augmentation:", df_final.shape)


Original data: (477, 2)
After augmentation: (613, 2)


In [4]:
# --------------------------------------
# Cell 4: TF-IDF Feature Engineering
# --------------------------------------

tfidf_word = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1,3),
    max_features=60000,
    sublinear_tf=True
)

tfidf_char = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3,6),
    max_features=60000
)

tfidf = FeatureUnion([
    ('word', tfidf_word),
    ('char', tfidf_char)
])

X = tfidf.fit_transform(df_final["merchant"])
y = df_final["category"]

print("TF-IDF shape:", X.shape)


TF-IDF shape: (613, 13211)


In [5]:
# ----------------------------------------------------
# Cell 5: Stratified 5-Fold Cross Validation for SVM
# ----------------------------------------------------

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold_scores = []

for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):

    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    svm = LinearSVC(
        C=2.0,
        class_weight='balanced',
        random_state=42
    )

    svm.fit(X_train, y_train)
    preds = svm.predict(X_test)

    f1 = f1_score(y_test, preds, average='macro')
    fold_scores.append(f1)

    print(f"Fold {fold+1} F1: {f1:.4f}")

print("\nAverage Macro F1:", np.mean(fold_scores))


Fold 1 F1: 0.8440
Fold 2 F1: 0.8537
Fold 3 F1: 0.8310
Fold 4 F1: 0.8742
Fold 5 F1: 0.9004

Average Macro F1: 0.8606660142323379


In [6]:
# ---------------------------------------
# Cell 6: Train final SVM on full dataset
# ---------------------------------------

final_svm = LinearSVC(
    C=2.0,
    class_weight='balanced',
    random_state=42
)
final_svm.fit(X, y)

print("Final SVM model trained!")


Final SVM model trained!


In [7]:
# -------------------------
# Cell 8: Prediction helper
# -------------------------

def predict_category(text):
    cleaned = clean_text(text)
    vec = tfidf.transform([cleaned])
    pred = final_svm.predict(vec)[0]
    return pred

# Test
print(predict_category("swgy ordr #118"))
print(predict_category("hpcl pump 2211"))
print(predict_category("dentl clnc session"))


Food
Fuel
Healthcare
