In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib
import re


## 1. Preprocessing Function


In [2]:
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r"[^a-z0-9\s]", " ", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text
    return ""

## 2. Load Datasets

In [5]:
train_df = pd.read_csv("../data/transactions.csv")
test_clean_df = pd.read_csv("../data/test_clean.csv")
test_noisy_df = pd.read_csv("../data/test_noisy.csv")

# Apply cleaning
train_df["clean"] = train_df["text"].apply(clean_text)
test_clean_df["clean"] = test_clean_df["text"].apply(clean_text)
test_noisy_df["clean"] = test_noisy_df["text"].apply(clean_text)

## 3. Split Train/Validation

In [6]:
X = train_df["clean"]
y = train_df["category"]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training rows:", len(X_train))
print("Validation rows:", len(X_val))

Training rows: 381
Validation rows: 96


## 4. Build Model Pipeline

In [7]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
model = LogisticRegression(max_iter=300)

X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

model.fit(X_train_vec, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,300


## 5. Validation Evaluation


In [8]:
val_pred = model.predict(X_val_vec)
print("\n=== Validation Performance ===\n")
print(classification_report(y_val, val_pred))


=== Validation Performance ===

               precision    recall  f1-score   support

        Bills       0.00      0.00      0.00         2
         Fuel       0.00      0.00      0.00         2
        Bills       0.58      0.70      0.64        10
Entertainment       0.73      0.67      0.70        12
         Food       0.57      1.00      0.73        12
         Fuel       0.88      0.70      0.78        10
    Groceries       1.00      0.42      0.59        12
   Healthcare       0.88      0.58      0.70        12
     Shopping       0.69      0.92      0.79        12
       Travel       0.73      0.92      0.81        12

     accuracy                           0.71        96
    macro avg       0.61      0.59      0.57        96
 weighted avg       0.73      0.71      0.69        96



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## 6. Evaluate on CLEAN TEST DATASET

In [9]:
print("\n=== Clean Unseen Test Dataset ===\n")
X_test_clean_vec = vectorizer.transform(test_clean_df["clean"])
test_clean_pred = model.predict(X_test_clean_vec)
print(classification_report(test_clean_df["category"], test_clean_pred))



=== Clean Unseen Test Dataset ===

               precision    recall  f1-score   support

        Bills       1.00      0.60      0.75         5
Entertainment       1.00      1.00      1.00         5
         Food       0.71      1.00      0.83         5
         Fuel       1.00      1.00      1.00         5
    Groceries       1.00      0.80      0.89         5
   Healthcare       0.83      1.00      0.91         5
     Shopping       0.75      0.60      0.67         5
       Travel       0.83      1.00      0.91         5

     accuracy                           0.88        40
    macro avg       0.89      0.88      0.87        40
 weighted avg       0.89      0.88      0.87        40



## 7. Evaluate on EXTREME NOISY TEST DATASET

In [10]:
print("\n=== Extreme Noisy Test Samples ===\n")
X_test_noisy_vec = vectorizer.transform(test_noisy_df["clean"])
test_noisy_pred = model.predict(X_test_noisy_vec)

# Show predictions with confidence scores
probas = model.predict_proba(X_test_noisy_vec)

for i in range(len(test_noisy_df)):
    text = test_noisy_df["text"].iloc[i]
    pred = test_noisy_pred[i]
    confidence = round(probas[i].max(), 3)
    print(f"Input: {text}\n → Predicted: {pred}  | Confidence: {confidence}\n")



=== Extreme Noisy Test Samples ===

Input: strbks cf#21
 → Predicted: Food  | Confidence: 0.141

Input: mcd brkfst
 → Predicted: Food  | Confidence: 0.205

Input: zmt ord#999
 → Predicted: Healthcare  | Confidence: 0.162

Input: pzz hut blk88
 → Predicted: Food  | Confidence: 0.324

Input: swgy-fd
 → Predicted: Food  | Confidence: 0.208

Input: amzn ord rm889
 → Predicted: Shopping  | Confidence: 0.188

Input: flpkt itm#22
 → Predicted: Groceries  | Confidence: 0.164

Input: nykaa cos#11
 → Predicted: Shopping  | Confidence: 0.159

Input: dmrt mkt
 → Predicted: Food  | Confidence: 0.141

Input: frstcry kdsp
 → Predicted: Food  | Confidence: 0.141

Input: ubr trp#12
 → Predicted: Food  | Confidence: 0.141

Input: ola cr 9p
 → Predicted: Travel  | Confidence: 0.298

Input: mmt flt6e
 → Predicted: Food  | Confidence: 0.141

Input: mtr rcg 77
 → Predicted: Healthcare  | Confidence: 0.16

Input: rstc tckt
 → Predicted: Entertainment  | Confidence: 0.174

Input: jio rcg149
 → Predicted: Foo

## 8. Save Model + Vectorizer

In [None]:
joblib.dump(vectorizer, "../model/tfidf.pkl")
joblib.dump(model, "../model/logreg_model.pkl")
print("\nModel & vectorizer saved successfully!")