In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib
import re

print("Starting the expense categorizer training process,")

Starting the expense categorizer training process,


In [3]:
df = pd.read_csv('dataset.csv')
df

Unnamed: 0,Transaction,Category
0,zomato order 250,Food & Dining
1,swiggy food delivery,Food & Dining
2,dinner at barbeque nation,Food & Dining
3,dominos pizza order,Food & Dining
4,paradise biryani,Food & Dining
...,...,...
87,the body shop products,Personal Care
88,perfume purchase,Personal Care
89,shaving kit order,Personal Care
90,supercuts salon,Personal Care


In [None]:
df = pd.read_csv('dataset.csv')

# text cleaning
df['Clean_Transaction'] = (
    df['Transaction']
    .str.lower()                      # 1. Lowercase text
    .str.replace(r'\d+', '', regex=True) # 2. Remove numbers
    .str.replace(r'\s+', ' ', regex=True) # 3. Replace multiple spaces with one
    .str.strip()                      # 4. Remove leading/trailing spaces
)

print("Data after cleaning with pandas:")
print(df)

Data after cleaning with pandas:
           Transaction           Category    Clean_Transaction
0     zomato order 250      Food & Dining         zomato order
1      uber ride 300               Travel            uber ride
2  lic premium payment  Bills & Utilities  lic premium payment


In [5]:
# Feature Engineering (TF-IDF) 
# # Initiaize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=500)

# X-features and y-target vector
X = tfidf_vectorizer.fit_transform(df['Clean_Transaction'])
y = df['Category']

print(f"\nShape of TF-IDF matrix: {X.shape}")


Shape of TF-IDF matrix: (92, 183)


In [6]:
# Train-Test Split
# Remove categories with less than 2 samples
valid_categories = df['Category'].value_counts()[df['Category'].value_counts() >= 2].index
filtered_df = df[df['Category'].isin(valid_categories)]

X_filtered = tfidf_vectorizer.transform(filtered_df['Clean_Transaction'])
y_filtered = filtered_df['Category']

X_train, X_test, y_train, y_test = train_test_split(X_filtered, y_filtered, test_size=0.3, random_state=42, stratify=y_filtered)

print(f"\nTraining data size: {X_train.shape[0]} samples")
print(f"Testing data size: {X_test.shape[0]} samples")


Training data size: 63 samples
Testing data size: 28 samples


In [7]:
# Model Training 
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)
print("\nModel training complete.")


Model training complete.


In [8]:
# --- 5. Model Evaluation ---
# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate and print evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(report)



Model Accuracy: 0.3929

Classification Report:
                   precision    recall  f1-score   support

Bills & Utilities       0.33      0.25      0.29         4
    Entertainment       1.00      0.25      0.40         4
    Food & Dining       0.00      0.00      0.00         4
        Groceries       1.00      0.33      0.50         3
Health & Wellness       1.00      0.33      0.50         3
    Personal Care       1.00      0.50      0.67         2
         Shopping       1.00      0.50      0.67         4
           Travel       0.21      1.00      0.35         4

         accuracy                           0.39        28
        macro avg       0.69      0.40      0.42        28
     weighted avg       0.65      0.39      0.40        28



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [9]:
# --- 6. Save the Model and Vectorizer ---
# Save the trained model and the TF-IDF vectorizer for future use
joblib.dump(model, 'expense_model.joblib')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')

print("\nModel and vectorizer have been saved as 'expense_model.joblib' and 'tfidf_vectorizer.joblib'")


Model and vectorizer have been saved as 'expense_model.joblib' and 'tfidf_vectorizer.joblib'


In [10]:
# --- 7. Prediction Function Example ---
def predict_category(transaction_text):
    """
    Predicts the category for a new transaction text.
    """
    # Load the saved model and vectorizer
    loaded_model = joblib.load('expense_model.joblib')
    loaded_vectorizer = joblib.load('tfidf_vectorizer.joblib')
    
    # Clean the input text
    cleaned_text = clean_text(transaction_text)
    
    # Vectorize the text using the loaded vectorizer
    text_vectorized = loaded_vectorizer.transform([cleaned_text])
    
    # Predict the category
    prediction = loaded_model.predict(text_vectorized)
    
    return prediction[0]

In [11]:
# --- Example Usage ---
print("\n--- Testing the prediction function ---")
new_transaction_1 = "uber ride home 230"
new_transaction_2 = "dinner at paradise hotel"

print(f"'{new_transaction_1}' is categorized as: {predict_category(new_transaction_1)}")
print(f"'{new_transaction_2}' is categorized as: {predict_category(new_transaction_2)}")


--- Testing the prediction function ---
'uber ride home 230' is categorized as: Travel
'dinner at paradise hotel' is categorized as: Food & Dining
