In [9]:

import re

import joblib
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the labelled transactions from a CSV file in the working directory.
DATA_PATH = "combined_dataset_train.csv"
df = pd.read_csv(DATA_PATH)

In [5]:
# Remove the columns the model does not use. errors="ignore" keeps this cell
# idempotent: re-running it after the columns are already gone is a no-op
# instead of raising KeyError.
df = df.drop(columns=["Balance", "Date"], errors="ignore")

In [11]:
# Keep only rows that have both the text feature and the label.
required_columns = ["Description", "Category"]
df = df.dropna(subset=required_columns)


In [13]:
# Preview the first five rows after cleaning.
df.head(n=5)

Unnamed: 0,Category,Description,Withdrawal,Deposit
0,Opening,OPENING BALANCE,0,0
1,Mandatory/Utility,SALARY CRED,0,75000
2,Travel,MakeMyTrip,734,0
3,Mandatory/Utility,Credit Card Bill,1053,0
4,Non-Mandatory (Food/Grocery/Households),UPI/Subway,2051,0


In [15]:
# Normalise the free-text description: lower-case it, then delete every
# character that is not an ASCII letter, a digit, whitespace or "/".
lowercased_description = df["Description"].str.lower()
df["Description"] = lowercased_description.str.replace(
    r"[^a-zA-Z0-9\s/]", "", regex=True
)


In [21]:
# Model inputs (description text + the two amount columns) and the target label.
feature_columns = ["Description", "Withdrawal", "Deposit"]
X = df[feature_columns]
y = df["Category"]

In [19]:
# Split the data into train and test sets by row position: the first 85% of
# rows are used for training and the remaining rows are held out.
# NOTE(review): rows are neither shuffled nor stratified here, so the test set
# is simply the tail of the file order - confirm that this is intended.

split_index = int(len(df) * 0.85)

X_train = X.iloc[:split_index]
y_train = y.iloc[:split_index]

X_test = X.iloc[split_index:]
y_test = y.iloc[split_index:]

In [23]:
# Text pipeline and numeric pipeline, combined into one column-wise preprocessor.

# The text column is named with a plain string (not a list) so that the
# vectorizer receives a 1-D column of documents.
text_features = "Description"
numeric_features = ["Withdrawal", "Deposit"]

# Standardise the two amount columns.
numeric_transformer = Pipeline([("scaler", StandardScaler())])

# TF-IDF over unigrams and bigrams; a term must occur in at least 2 documents
# and in at most 90% of them, and English stop words are removed.
text_transformer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.9,
)

preprocessor = ColumnTransformer([
    ("text", text_transformer, text_features),
    ("num", numeric_transformer, numeric_features),
])

In [25]:
# Classifier: logistic regression. class_weight="balanced" weights each class
# inversely to its frequency in the training labels; max_iter is raised to 1000.

model = LogisticRegression(class_weight="balanced", max_iter=1000, n_jobs=-1)

In [27]:
# Chain preprocessing and the classifier, then fit on the training rows.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("classifier", model),
]
pipeline = Pipeline(pipeline_steps)
pipeline.fit(X_train, y_train)

In [29]:
# Predict the held-out rows, then print the per-class report followed by the
# confusion matrix.
y_pred = pipeline.predict(X_test)

per_class_report = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print(per_class_report)
print(confusion)

                                         precision    recall  f1-score   support

                             Adjustment       1.00      1.00      1.00         3
                     Investment/Savings       1.00      1.00      1.00        12
                   Luxury/Discretionary       1.00      0.94      0.97        16
                      Mandatory/Utility       1.00      1.00      1.00        14
Non-Mandatory (Food/Grocery/Households)       0.93      1.00      0.96        13
                                Opening       1.00      1.00      1.00         2
                                 Travel       1.00      1.00      1.00        14

                               accuracy                           0.99        74
                              macro avg       0.99      0.99      0.99        74
                           weighted avg       0.99      0.99      0.99        74

[[ 3  0  0  0  0  0  0]
 [ 0 12  0  0  0  0  0]
 [ 0  0 15  0  1  0  0]
 [ 0  0  0 14  0  0  0]
 [ 0  0  0

In [31]:
# Overall accuracy of the predictions on the held-out rows.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

Model Accuracy: 0.9865


In [33]:
# Character filter matching the one the training cell applies to
# df["Description"]: everything except ASCII letters, digits, whitespace and
# "/" is deleted.
_DESCRIPTION_NOISE = re.compile(r"[^a-zA-Z0-9\s/]")


def _description_text(description) -> str:
    """Return `description` as a string; None / NaN become an empty string."""
    if isinstance(description, str):
        return description
    return "" if pd.isna(description) else str(description)


def predict_category(
    description: str,
    withdrawal: float,
    deposit: float,
    model_pipeline
):
    """
    Predict a transaction category using business rules first, then the model.

    Parameters
    ----------
    description : str
        Raw transaction description. A missing value (None / NaN) is treated
        as an empty description instead of raising.
    withdrawal : float
        Amount debited by the transaction.
    deposit : float
        Amount credited by the transaction.
    model_pipeline
        Fitted estimator exposing ``predict(DataFrame)``; it receives a
        one-row frame with the columns ``Description``, ``Withdrawal`` and
        ``Deposit``.

    Returns
    -------
    The label ``"Opening"`` or ``"Adjustment"`` when a rule matches, otherwise
    the first element of ``model_pipeline.predict(...)``.
    """

    lowered = _description_text(description).lower()

    # 1. Opening balance rule: a row that moves no money.
    if withdrawal == 0 and deposit == 0:
        return "Opening"

    # 2. Refund / adjustment rule (checked on the lower-cased raw text).
    if "refund" in lowered or "reversal" in lowered:
        return "Adjustment"

    # 3. ML prediction. The text is normalised exactly like the training data
    #    (lower-case + character filter); otherwise punctuation that was
    #    deleted during training would split tokens differently at inference.
    input_df = pd.DataFrame([{
        "Description": _DESCRIPTION_NOISE.sub("", lowered),
        "Withdrawal": withdrawal,
        "Deposit": deposit
    }])

    return model_pipeline.predict(input_df)[0]

In [35]:
#BATCH PREDICTION FUNCTION (FOR DATAFRAMES)


def predict_dataframe(df, model_pipeline):
    """
    Label every row of `df` via `predict_category` and return a copy.

    `df` must provide the columns "Description", "Withdrawal" and "Deposit".
    The input frame is left unmodified; the returned copy carries one extra
    column, "Predicted_Category", holding the label for each row.
    """
    labelled = df.copy()
    labelled["Predicted_Category"] = [
        predict_category(
            description=record["Description"],
            withdrawal=record["Withdrawal"],
            deposit=record["Deposit"],
            model_pipeline=model_pipeline,
        )
        for _, record in df.iterrows()
    ]
    return labelled

In [41]:
# Single-transaction example: a 5200 withdrawal described as "Crypto Purchase".
predict_category("Crypto Purchase", 5200, 0, pipeline)


'Investment/Savings'

In [43]:
# Batch example: (description, withdrawal, deposit) for four transactions.
sample_transactions = [
    ("OPENING BALANCE", 0, 0),
    ("UPI/Uber", 350, 0),
    ("Crypto Purchase", 5200, 0),
    ("UPI/Dmart REFUND", 0, 4319),
]
new_data = pd.DataFrame(
    sample_transactions,
    columns=["Description", "Withdrawal", "Deposit"],
)

result = predict_dataframe(new_data, pipeline)
print(result)


        Description  Withdrawal  Deposit  Predicted_Category
0   OPENING BALANCE           0        0             Opening
1          UPI/Uber         350        0              Travel
2   Crypto Purchase        5200        0  Investment/Savings
3  UPI/Dmart REFUND           0     4319          Adjustment


In [45]:
# Persist the fitted pipeline (preprocessor + classifier) to disk.
# The text cleaning applied to df["Description"] before fitting and the rule
# checks inside predict_category are not part of this artifact.
# Only load the resulting file from a trusted location: it is a pickle.
MODEL_PATH = "transaction_classifier.pkl"
joblib.dump(pipeline, MODEL_PATH)


['transaction_classifier.pkl']