In [1]:
# The twelve spending categories used as classification labels. The same
# strings appear as the keys of the `transaction_patterns` mapping.
# NOTE(review): `categories` is not referenced again in the visible cells;
# the labels used for training are read from the `category` column instead.
categories = [
    "Food & Dining",
    "Groceries",
    "Fuel",
    "Shopping",
    "Entertainment",
    "Bills & Utilities",
    "Healthcare",
    "Travel",
    "Education",
    "Rent",
    "Insurance",
    "Transfers"
]


In [2]:
# Maps each category label to example merchant / payee strings.
# NOTE(review): `transaction_patterns` is reassigned in the data-generation
# cell (the one that imports `random`) with longer merchant lists, so this
# copy is superseded when the notebook is run top to bottom.
transaction_patterns = {
    "Food & Dining": ["Starbucks", "Dominos", "Zomato", "Swiggy", "KFC", "Subway"],
    "Groceries": ["Big Bazaar", "DMart", "Reliance Fresh", "More Supermarket", "Spencer's"],
    "Fuel": ["HP Gas", "IndianOil", "Shell", "BPCL", "IOCL Pump"],
    "Shopping": ["Amazon", "Flipkart", "Myntra", "Ajio", "Zara", "H&M"],
    "Entertainment": ["Netflix", "Hotstar", "Spotify", "BookMyShow", "PVR Cinemas"],
    "Bills & Utilities": ["Paytm Recharge", "BSNL Bill", "Airtel", "Tata Power", "Water Bill"],
    "Healthcare": ["Apollo Pharmacy", "MedPlus", "1mg", "Fortis Hospital", "Pathlab"],
    "Travel": ["IRCTC", "Uber", "Ola", "MakeMyTrip", "IndiGo", "Air India"],
    "Education": ["Coursera", "Udemy", "Byjus", "Unacademy", "School Fee"],
    "Rent": ["Rent Payment", "Flat Rent", "Landlord", "PG Fee"],
    "Insurance": ["LIC", "HDFC Life", "ICICI Prudential", "Bajaj Allianz"],
    "Transfers": ["NEFT Transfer", "IMPS", "UPI to Friend", "Account Transfer"]
}


In [3]:
import random
import pandas as pd
from datetime import datetime, timedelta

# Categories and merchants: each key is a category label and each value is the
# pool of merchant / payee names the generator below samples from for that
# label. This assignment rebinds `transaction_patterns`, replacing any
# definition from an earlier cell.
transaction_patterns = {
    "Food & Dining": ["Starbucks", "Dominos", "Zomato", "Swiggy", "KFC", "Subway", "Cafe Coffee Day"],
    "Groceries": ["Big Bazaar", "DMart", "Reliance Fresh", "More Supermarket", "Spencer's", "Nature's Basket"],
    "Fuel": ["HP Gas", "IndianOil", "Shell", "BPCL", "IOCL Pump"],
    "Shopping": ["Amazon", "Flipkart", "Myntra", "Ajio", "Zara", "H&M", "Nykaa"],
    "Entertainment": ["Netflix", "Hotstar", "Spotify", "BookMyShow", "PVR Cinemas"],
    "Bills & Utilities": ["Paytm Recharge", "BSNL Bill", "Airtel", "Tata Power", "Water Bill", "Electricity Board"],
    "Healthcare": ["Apollo Pharmacy", "MedPlus", "1mg", "Fortis Hospital", "Pathlab", "CureFit"],
    "Travel": ["IRCTC", "Uber", "Ola", "MakeMyTrip", "IndiGo", "Air India", "RedBus"],
    "Education": ["Coursera", "Udemy", "Byjus", "Unacademy", "School Fee", "College Fee"],
    "Rent": ["Rent Payment", "Flat Rent", "Landlord", "PG Fee"],
    "Insurance": ["LIC", "HDFC Life", "ICICI Prudential", "Bajaj Allianz", "PolicyBazaar"],
    "Transfers": ["NEFT Transfer", "IMPS", "UPI to Friend", "Account Transfer", "Money Sent"]
}

def add_noise(text, rng=None, upper_prob=0.2, leet_prob=0.1):
    """Return ``text`` with random casing and occasional symbol substitutions.

    Every character is independently upper-cased with probability
    ``upper_prob`` and lower-cased otherwise. Afterwards, with probability
    ``leet_prob``, every lowercase ``a`` is replaced by ``@`` and every
    lowercase ``o`` by ``0``; characters that were upper-cased in the first
    step are left as they are.

    Args:
        text: The string to perturb.
        rng: Optional object exposing a ``random()`` method that returns a
            float in ``[0, 1)``, e.g. ``random.Random(seed)``. When omitted,
            the module-level ``random`` generator is used, so calling
            ``random.seed(...)`` beforehand makes the result reproducible.
        upper_prob: Per-character probability of upper-casing.
        leet_prob: Probability of applying the ``a``/``o`` substitution to
            the whole string.

    Returns:
        The perturbed string.
    """
    source = random if rng is None else rng

    # One draw per character, in order, decides that character's case.
    recased = ''.join(
        ch.upper() if source.random() < upper_prob else ch.lower()
        for ch in text
    )

    # A single further draw decides whether the typo-like substitution is
    # applied to the entire string.
    if source.random() < leet_prob:
        recased = recased.replace('a', '@').replace('o', '0')
    return recased

# Generate data
# Seed Python's `random` module (used for every draw in the loop and inside
# `add_noise`) and pass the same seed to the pandas shuffle further down, so a
# Restart & Run All regenerates exactly the same CSV and downstream metrics.
SEED = 42
random.seed(SEED)

records = []
start_date = datetime(2024, 1, 1)
for category, merchants in transaction_patterns.items():
    for _ in range(800):  # 12 categories x 800 rows = 9600 rows in total
        merchant = random.choice(merchants)
        # The channel, reference number, amount and date are drawn
        # independently of the category, so only the merchant name carries
        # label information.
        txn_type = random.choice(["POS", "UPI", "NEFT", "IMPS", "CARD"])
        txn_id = random.randint(1000, 9999)
        amount = round(random.uniform(50, 5000), 2)
        date = start_date + timedelta(days=random.randint(0, 300))
        desc = f"{merchant} {txn_type} TXN{txn_id}"
        desc = add_noise(desc)
        records.append({
            # Sequential id assigned in generation order (i.e. before the
            # shuffle below), starting at 1.
            "transaction_id": len(records)+1,
            "description": desc,
            "amount": amount,
            "date": date.strftime("%Y-%m-%d"),
            "category": category
        })

# Create dataframe and shuffle
# Rows are generated category by category; shuffling removes that ordering.
df = pd.DataFrame(records)
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Save to CSV
df.to_csv("synthetic_transactions.csv", index=False)
print("‚úÖ Synthetic dataset created: synthetic_transactions.csv")
print(f"Total samples: {len(df)}")
print(df.head())


‚úÖ Synthetic dataset created: synthetic_transactions.csv
Total samples: 9600
   transaction_id                   description   amount        date  \
0             234            swiggy pos txn9956   508.39  2024-02-26   
1            1845         indIAnoil upi txn8500  1336.81  2024-07-15   
2            6011             irctc pos txn1660  2817.48  2024-04-04   
3             761              kFc impS txn9597  3619.23  2024-06-24   
4            5530  apoLlo pharmAcy imps Txn1793  4661.22  2024-07-16   

        category  
0  Food & Dining  
1           Fuel  
2         Travel  
3  Food & Dining  
4     Healthcare  


In [4]:
import pandas as pd
import numpy as np
import re
import string
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score


In [5]:
# Reload the generated dataset from the CSV file, so the modelling cells work
# from what is on disk. This rebinds `df`.
df = pd.read_csv("synthetic_transactions.csv")
print("‚úÖ Dataset Loaded Successfully!")
# Preview the first rows, followed by a blank line, then the row count.
print(df.head(), "\n")
print("Total Samples:", len(df))

‚úÖ Dataset Loaded Successfully!
   transaction_id                   description   amount        date  \
0             234            swiggy pos txn9956   508.39  2024-02-26   
1            1845         indIAnoil upi txn8500  1336.81  2024-07-15   
2            6011             irctc pos txn1660  2817.48  2024-04-04   
3             761              kFc impS txn9597  3619.23  2024-06-24   
4            5530  apoLlo pharmAcy imps Txn1793  4661.22  2024-07-16   

        category  
0  Food & Dining  
1           Fuel  
2         Travel  
3  Food & Dining  
4     Healthcare   

Total Samples: 9600


In [6]:
def clean_text(text):
    """Normalise a transaction description before vectorisation.

    The text is lower-cased, every character other than an ASCII letter, a
    digit or whitespace is replaced by a space, runs of whitespace are
    collapsed to a single space, and leading/trailing whitespace is removed.

    Args:
        text: The raw description. ``None`` and float NaN (what pandas yields
            for an empty CSV cell) are treated as missing; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string, or ``""`` for missing input.
    """
    # `x != x` is only true for NaN; this avoids an AttributeError from
    # calling `.lower()` on a missing value during `Series.apply`.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    # After lower-casing, only a-z can remain of the ASCII letters.
    text = re.sub(r'[^a-z0-9\s]', ' ', text)  # remove special chars
    text = re.sub(r'\s+', ' ', text).strip()   # remove extra spaces
    return text

# Store the normalised text in a new column; the raw `description` column is
# left unchanged.
df['clean_description'] = df['description'].apply(clean_text)


In [7]:
# 3. SPLIT INTO TRAIN / TEST
# --------------------------------------------------------------
# Features are the cleaned description strings; labels are the category names.
X = df['clean_description']
y = df['category']

# Hold out 20% of the rows for testing. `stratify=y` asks scikit-learn to keep
# the category proportions equal in both splits; `random_state` fixes the split.
# NOTE(review): the generator draws every description from a fixed merchant
# pool per category, so train and test contain the same merchant names; the
# test score therefore says little about merchants the model has never seen.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")

Training samples: 7680, Test samples: 1920


In [8]:
# 4. TF-IDF VECTORIZATION
# --------------------------------------------------------------
# Unigram + bigram features, with the vocabulary capped at 5000 entries.
# NOTE(review): stop_words='english' presumably also drops merchant tokens
# that coincide with English stop words (e.g. the "more" in
# "More Supermarket") — confirm this is intended.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    stop_words='english'
)

# Fit on the training split only, then reuse the fitted vectorizer to
# transform the test split.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# --------------------------------------------------------------
# 5. MODEL TRAINING (Logistic Regression)
# --------------------------------------------------------------
# `C` is scikit-learn's inverse regularisation strength; `max_iter` bounds the
# number of solver iterations.
model = LogisticRegression(max_iter=1000, solver='lbfgs', C=5.0)
model.fit(X_train_vec, y_train)


In [9]:
# Predict categories for the held-out test split.
y_pred = model.predict(X_test_vec)

# Per-category precision / recall / F1, printed with three decimals.
print("\nüìä Classification Report:\n")
print(classification_report(y_test, y_pred, digits=3))

# Overall accuracy and support-weighted F1, both reported as percentages.
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"\n‚úÖ Test Accuracy: {acc*100:.2f}%")
print(f"‚úÖ Weighted F1 Score: {f1*100:.2f}%")



üìä Classification Report:

                   precision    recall  f1-score   support

Bills & Utilities      1.000     1.000     1.000       160
        Education      1.000     1.000     1.000       160
    Entertainment      1.000     0.988     0.994       160
    Food & Dining      1.000     1.000     1.000       160
             Fuel      1.000     0.994     0.997       160
        Groceries      1.000     1.000     1.000       160
       Healthcare      1.000     1.000     1.000       160
        Insurance      1.000     1.000     1.000       160
             Rent      1.000     0.994     0.997       160
         Shopping      0.970     1.000     0.985       160
        Transfers      1.000     0.994     0.997       160
           Travel      0.994     0.994     0.994       160

         accuracy                          0.997      1920
        macro avg      0.997     0.997     0.997      1920
     weighted avg      0.997     0.997     0.997      1920


‚úÖ Test Accuracy: 99.69%
‚úÖ Weighted F1 Score: 99.69%

In [10]:
# 7. TEST MODEL ON HAND-WRITTEN SAMPLE DESCRIPTIONS
# --------------------------------------------------------------
# NOTE(review): these strings are new as whole descriptions, but every merchant
# name in them (Starbucks, Netflix, Apollo Pharmacy, IRCTC, LIC, Amazon, Uber)
# is also in the merchant pools used to generate the training data, so this is
# a sanity check rather than a test on unknown merchants.
test_samples = [
    "Starbucks POS TXN2345",
    "Netflix Monthly Subscription",
    "Apollo Pharmacy Bill",
    "IRCTC Ticket Booking",
    "LIC Annual Premium",
    "Amazon Order TXN4421",
    "Uber Ride Payment"
]

# Apply the same cleaning and the already-fitted vectorizer used for training
# before asking the model for predictions.
test_samples_clean = [clean_text(t) for t in test_samples]
test_vec = vectorizer.transform(test_samples_clean)
preds = model.predict(test_vec)

# Print each raw sample (left-aligned, padded to 40 characters) next to its
# predicted category.
print("\nüöÄ Model Predictions on Unknown Transactions:")
for t, p in zip(test_samples, preds):
    print(f"{t:<40} ‚Üí {p}")


üöÄ Model Predictions on Unknown Transactions:
Starbucks POS TXN2345                    ‚Üí Food & Dining
Netflix Monthly Subscription             ‚Üí Entertainment
Apollo Pharmacy Bill                     ‚Üí Healthcare
IRCTC Ticket Booking                     ‚Üí Travel
LIC Annual Premium                       ‚Üí Insurance
Amazon Order TXN4421                     ‚Üí Shopping
Uber Ride Payment                        ‚Üí Travel


In [11]:
# 8. SAVE MODEL + VECTORIZER
# --------------------------------------------------------------
# Persist both fitted objects: predictions need the vectorizer to turn text
# into features before the model can be applied. `clean_text` is not saved,
# so whoever loads these files must apply the same cleaning themselves.
joblib.dump(model, "transaction_classifier_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("\nüíæ Model and Vectorizer Saved Successfully!")
print("Files generated: transaction_classifier_model.pkl, tfidf_vectorizer.pkl")


üíæ Model and Vectorizer Saved Successfully!
Files generated: transaction_classifier_model.pkl, tfidf_vectorizer.pkl
