In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import warnings

# Silence any stray pandas warnings as the FutureWarning was being created which was silenced through this.
warnings.simplefilter(action="ignore", category=FutureWarning)

In [9]:
df = pd.read_csv('transactions_datasets.csv')

df["datetime"] = pd.to_datetime(
    df["datetime"],
    format="%Y-%m-%d %H:%M:%S,%f",
    errors="raise"            # will error if any row doesn't match the format
)

In [10]:
def map_to_category(desc: str) -> str:
    primary = desc.split(',')[0].strip().lower()
    if any(k in primary for k in ['utilities', 'bill', 'electricity', 'water', 'internet']):
        return 'Utilities'
    if any(k in primary for k in ['movie', 'theater', 'circus', 'theme park', 'live match', 'concert']):
        return 'Entertainment'
    if any(k in primary for k in ['food', 'snacks', 'coffee', 'restaurant', 'meal']):
        return 'Food'
    if any(k in primary for k in ['buying', 'shopping', 'book', 'gift', 'online']):
        return 'Shopping'
    if any(k in primary for k in ['bus ticket', 'aeroplane ticket', 'train ticket', 'ride share', 'hotel booking', 'booking']):
        return 'Travelling'
    if any(k in primary for k in ['topup', 'recharge', 'mobile topup']):
        return 'TopUp'
    if any(k in primary for k in ['subscription', 'gift', 'charity donation', 'parking fee', 'loan']):
        return 'Bill Split'
    return 'Others'

df2 = df.copy()
df2['label'] = df2['description'].apply(map_to_category)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    df2['description'],
    df2['label'],
    test_size=0.20,
    random_state=42,
    stratify=df2['label']
)

In [12]:
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1, 2), min_df=2)),
    ("clf", LogisticRegression(max_iter=1000))
])

param_grid = {
    "tfidf__max_df": [0.8, 1.0],
    "clf__C": [0.1, 1.0, 10.0]
}

grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring="f1_macro",
    n_jobs=1              # ← avoid Loky resource_tracker issues as it arises due to parallel processing issue
)

grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)
print(classification_report(y_test, y_pred))

               precision    recall  f1-score   support

   Bill Split       1.00      1.00      1.00        75
Entertainment       1.00      1.00      1.00        84
         Food       1.00      1.00      1.00        36
     Shopping       1.00      1.00      1.00        92
        TopUp       1.00      1.00      1.00        19
   Travelling       1.00      1.00      1.00        75
    Utilities       1.00      1.00      1.00        19

     accuracy                           1.00       400
    macro avg       1.00      1.00      1.00       400
 weighted avg       1.00      1.00      1.00       400



In [13]:
# Now testing the model with an example description
example_desc = "Buying a t-shirt"
pred = grid.predict([example_desc])[0]
print(f"Description: '{example_desc}' → Predicted category: {pred}")

Description: 'Buying a t-shirt' → Predicted category: Shopping


In [14]:
print("The Logistic Regression with label encoded with the category for Supervised learning is working great.")

The Logistic Regression with label encoded with the category for Supervised learning is working great.


# It has performed better than Clustering and we were able to do it till the final so it is great, Created by Rohan Thapa.