In [1]:
import random
import re
import csv

INPUT_TXT = "/content/textcategory.txt"      # your txt file
OUTPUT_CSV = "transactions_noisy.csv"
TARGET_ROWS = 5000

noise_prefix = [
    "", "upi", "payment", "txn", "sent", "online", "upi txn", "paid"
]

noise_suffix = [
    "", "payment", "txn", "transfer", "via upi", "ref"
]

def clean_text(text):
    text = text.replace("/", " ")
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def add_noise(text):
    text = clean_text(text)

    # remove numbers randomly
    if random.random() < 0.4:
        text = re.sub(r"\d+", "", text)

    # remove UPI randomly
    if random.random() < 0.4:
        text = text.replace("UPI", "").replace("upi", "")

    words = text.split()

    # shuffle words sometimes
    if random.random() < 0.3 and len(words) > 2:
        random.shuffle(words)

    text = " ".join(words)

    text = f"{random.choice(noise_prefix)} {text} {random.choice(noise_suffix)}"
    return re.sub(r"\s+", " ", text).strip().lower()


rows = []

with open(INPUT_TXT, "r", encoding="utf-8") as f:
    lines = f.readlines()

base_data = []
for line in lines:
    line = line.strip()
    if not line or "," not in line:
        continue
    text, category = line.rsplit(",", 1)
    base_data.append((text.strip(), category.strip()))

while len(rows) < TARGET_ROWS:
    text, category = random.choice(base_data)
    noisy_text = add_noise(text)
    rows.append([noisy_text, category])

with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["text", "category"])
    writer.writerows(rows)

print(f"‚úÖ Generated {len(rows)} noisy rows ‚Üí {OUTPUT_CSV}")


‚úÖ Generated 5000 noisy rows ‚Üí transactions_noisy.csv


In [4]:
import pandas as pd
import re

df = pd.read_csv("/content/transactions_noisy.csv")

# 2. DEFINE THE CLEANING FUNCTION
def clean_bank_text(text):
    # A. Convert to lowercase
    text = text.lower()

    # B. Remove numeric IDs (sequences of numbers usually > 3 digits)
    # This removes UPI IDs, Phone numbers, Account numbers
    text = re.sub(r'\d+', '', text)

    # C. Remove Special Characters (keep only letters and spaces)
    text = re.sub(r'[^\w\s]', '', text)

    # D. Define "Financial Stop Words"
    # These words appear in almost every transaction and confuse the model
    stop_words = [
        'upi', 'txn', 'payment', 'transfer', 'via', 'sent', 'paid',
        'ref', 'bill', 'online', 'to', 'bank', 'from', 'recharge', 'using', 'payt'
    ]

    # Filter out the stop words
    words = text.split()
    cleaned_words = [w for w in words if w not in stop_words]

    # E. Join back together
    return " ".join(cleaned_words)

# 3. APPLY THE CLEANING
df['cleaned_text'] = df['text'].apply(clean_bank_text)

# 4. REMOVE EMPTY ROWS
# Sometimes a row is just "UPI TXN" and becomes empty after cleaning. We drop those.
df = df[df['cleaned_text'] != '']

# Display side-by-side comparison
pd.set_option('display.max_colwidth', None)
print(df.head(15))

                                                         text     category  \
0                                      payment dth sun direct    Utilities   
1        upi txn stores 109908735413 upi swadeshi upi via upi    Groceries   
2                             txn upi indian railways upi txn    Utilities   
3              sent toll 110847936125 electronic city via upi    Transport   
4            upi basket upi nature‚Äôs upi 109908735413 via upi    Groceries   
5                    upi mtc chennai 110847936109 upi via upi    Transport   
6                                     txn upi gym upi payment    Utilities   
7                   sent balaji upi upi grocery mart transfer    Groceries   
8            upi txn balaji grocery mart 109908735413 via upi    Groceries   
9                                     upi ashwin cool bar upi  Food_Dining   
10                                 payment upi spar india upi    Groceries   
11                         upi ola share 110847936118 via upi 

In [5]:
df

Unnamed: 0,text,category,cleaned_text
0,payment dth sun direct,Utilities,dth sun direct
1,upi txn stores 109908735413 upi swadeshi upi via upi,Groceries,stores swadeshi
2,txn upi indian railways upi txn,Utilities,indian railways
3,sent toll 110847936125 electronic city via upi,Transport,toll electronic city
4,upi basket upi nature‚Äôs upi 109908735413 via upi,Groceries,basket natures
...,...,...,...
4995,payment upi all-in-one mart 109908735413 upi txn,Groceries,allinone mart
4996,upi redbus ticket 110847936111 upi,Transport,redbus ticket
4997,upi sachin dhaba upi transfer,Food_Dining,sachin dhaba
4998,upi spc mart 109232084934 payment,Groceries,spc mart


In [6]:
from sklearn.model_selection import train_test_split

# 1. Define your Features (X) and Target (y)
X = df['cleaned_text']  # The input text
y = df['category']      # The labels (Food, Utilities, etc.)

# 2. Perform the Split
# test_size=0.2 means 20% of data is saved for testing
# random_state=42 ensures the split is the same every time you run it (reproducibility)
# stratify=y is CRITICAL here. It ensures if you have 10% 'Utilities' in total,
# your test set also gets 10% 'Utilities'. It keeps the balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 3. Verify the Split
print(f"Total Data: {len(df)}")
print(f"Training Data (X_train): {len(X_train)} rows")
print(f"Test Data (X_test): {len(X_test)} rows")

print("-" * 30)
print("Example of Training Data:")
print(X_train.head(3))

Total Data: 5000
Training Data (X_train): 4000 rows
Test Data (X_test): 1000 rows
------------------------------
Example of Training Data:
558     nikhil jain
3467            kfc
1506    eb tangedco
Name: cleaned_text, dtype: object


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. Initialize the TF-IDF Vectorizer
# We configure it to capture single words (unigrams) and two-word phrases (bigrams).
# The max_features parameter limits the vocabulary size, which is good for speed and reducing noise.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),  # Consider both single words (unigrams) and 2-word pairs (bigrams)
    max_features=5000    # Use the 5000 most frequently used and unique features
)

# 2. Fit and Transform the Training Data
# The .fit() method learns the vocabulary and IDF weights ONLY from the training data.
X_train_features = vectorizer.fit_transform(X_train)

# 3. Transform the Test Data
# The .transform() method applies the vocabulary and weights learned in step 2 to the test data.
# CRITICAL: DO NOT use .fit_transform() on the test data!
X_test_features = vectorizer.transform(X_test)

# 4. Verification
print(f"Vocabulary size (number of features): {len(vectorizer.get_feature_names_out())}")
print(f"X_train Feature Matrix Shape: {X_train_features.shape}")
print(f"X_test Feature Matrix Shape: {X_test_features.shape}")

# Example of a feature name learned (showing n-grams):
print("\nExample of learned features (n-grams):")
print(vectorizer.get_feature_names_out()[200:205])

Vocabulary size (number of features): 984
X_train Feature Matrix Shape: (4000, 984)
X_test Feature Matrix Shape: (1000, 984)

Example of learned features (n-grams):
['coffee nellai' 'cool' 'cool bar' 'course' 'creamy']


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 1. Initialize the Model
# Use a simple, robust classifier
model = LogisticRegression(
    solver='liblinear', # Good choice for small to medium datasets
    random_state=42,
    C=1.0 # C is the inverse of regularization strength; smaller C means stronger regularization.
)

# 2. Train the Model
# This is where the model learns the patterns from the numerical data
print("\nStarting Model Training...")
model.fit(X_train_features, y_train)
print("Model Training Complete.")

# 3. Make Predictions on the Test Set
# We use the test set (the unseen data) to see how well the model works.
y_pred = model.predict(X_test_features)

# 4. Evaluate the Model Performance (Initial Peek)
print("-" * 30)
print(f"Accuracy on Test Set: {accuracy_score(y_test, y_pred):.4f}")


Starting Model Training...
Model Training Complete.
------------------------------
Accuracy on Test Set: 0.9980


In [10]:
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

# Define K-Fold Cross-Validation
# A common choice is k=5 or k=10. We will use k=5 for a good balance of speed and robustness.
# shuffle=True ensures the data is mixed before folding.
k_folds = KFold(n_splits=5, shuffle=True, random_state=42)

# --- 2. Perform Cross-Validation ---

print("Starting 5-Fold Cross-Validation...")

# Use the cross_val_score function to fit the model 5 times,
# each time on 4/5 of the training data and scoring on the remaining 1/5.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train_features,
    y=y_train,
    cv=k_folds,
    scoring='accuracy', # We will use accuracy for comparison
    n_jobs=-1 # Use all available CPU cores for speed
)

# --- 3. Analyze Results ---

print("Cross-Validation Complete.")
print("-" * 50)
print(f"Individual CV Fold Accuracies: {cv_scores}")
print(f"Mean CV Accuracy:              {np.mean(cv_scores):.4f}")
print(f"Standard Deviation of CV:      {np.std(cv_scores):.4f}")
print("-" * 50)

# If Mean CV Accuracy is significantly lower than Test Set Accuracy (0.9980),
# then the model is likely overfitting or the test set was too easy/lucky.
if np.mean(cv_scores) < 0.99:
    print("\n‚úÖ Diagnosis: Overfitting is confirmed, or the data has high variance.")
    print("Action: Consider regularization (reduce C), dropout, or more data augmentation.")
elif np.std(cv_scores) > 0.005:
    print("\n‚ö†Ô∏è Diagnosis: High variance across folds suggests the model's stability is poor.")
else:
    print("\nüí° Diagnosis: High CV score suggests the model generalizes well to different training partitions.")

Starting 5-Fold Cross-Validation...
Cross-Validation Complete.
--------------------------------------------------
Individual CV Fold Accuracies: [0.9975  0.99125 0.99375 0.995   0.99125]
Mean CV Accuracy:              0.9938
Standard Deviation of CV:      0.0024
--------------------------------------------------

üí° Diagnosis: High CV score suggests the model generalizes well to different training partitions.


In [12]:


print("F1-Weighted Cross-Validation Complete.")
print("-" * 50)
print(f"Individual CV Fold F1-Scores:  {cv_scores}")
print(f"Mean CV F1-Score:              {np.mean(cv_scores):.4f}")
print(f"Standard Deviation of CV F1:   {np.std(cv_scores):.4f}")
print("-" * 50)

F1-Weighted Cross-Validation Complete.
--------------------------------------------------
Individual CV Fold F1-Scores:  [0.9975  0.99125 0.99375 0.995   0.99125]
Mean CV F1-Score:              0.9938
Standard Deviation of CV F1:   0.0024
--------------------------------------------------


In [14]:

from sklearn.metrics import classification_report

# 2. Predict on the held-out test set
# (Assuming X_test_features and y_test are available)
y_pred = model.predict(X_test_features)

# --- Retrieve the Category Mapping for Readability ---
# You need the category names (e.g., 'Groceries', 'Utilities') instead of IDs (0, 1, 2)
# Since the category names are not explicitly defined in the provided data,
# we'll map back from the numerical codes in y_test/y_train to the string labels.

# This function reconstructs the map if you don't have it saved:
def get_label_names(y_true):
    # This relies on y_true being the original string labels before encoding
    # Since y_test here is numerical (category IDs), we need the original df.

    # ***ASSUMPTION: We use the unique labels found in the entire dataset for reporting***
    # Replace this placeholder with the actual category names from your original data load!
    if 'category' in df.columns:
        return sorted(df['category'].unique())
    else:
        # Placeholder names if original categories are unknown (will show numbers)
        return [f"Category {i}" for i in sorted(y_true.unique())]

# Get the list of unique category names in the dataset
target_names = get_label_names(y_test)


# 3. Print the Classification Report
print("\n" + "=" * 60)
print("                 FINAL CLASSIFICATION REPORT (ON HELD-OUT TEST SET)")
print("=" * 60)
print(classification_report(y_test, y_pred, target_names=target_names))
print("=" * 60)


                 FINAL CLASSIFICATION REPORT (ON HELD-OUT TEST SET)
              precision    recall  f1-score   support

 Food_Dining       1.00      1.00      1.00       218
   Groceries       1.00      1.00      1.00       198
   Transfers       1.00      1.00      1.00       224
   Transport       0.99      1.00      0.99       182
   Utilities       1.00      0.99      0.99       171
    category       1.00      1.00      1.00         7

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000



In [15]:
import joblib

# Save the model
joblib.dump(model, 'financial_classifier_model.pkl')

# Save the vectorizer (CRITICAL for consistent features)
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']