# Text Classification Using The LSTM Deep Learning Model

## Import Necessary Libraries

In [None]:
!pip install sastrawi pandas numpy tensorflow scikit-learn

In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import pickle
import re
import os

## Deleting Model Artifacts

In [6]:
# Files written by a previous training run; they are deleted so that this run
# starts from a clean slate.
artifacts = [
    'model_artifacts/finance_model.h5',
    'model_artifacts/tokenizer.pkl',
    'model_artifacts/label_mappings.pkl',
]

# `filter` is lazy, so each path is checked for existence right before removal.
for artifact_path in filter(os.path.exists, artifacts):
    os.remove(artifact_path)
    print(f"Removed previous {artifact_path}")

Removed previous model_artifacts/finance_model.h5
Removed previous model_artifacts/tokenizer.pkl
Removed previous model_artifacts/label_mappings.pkl


## Text Preprocessing

In [7]:
# Sastrawi stemmer instance. The name `lemmatizer` is what preprocess_text
# looks up, so it is kept even though the object is a stemmer.
factory = StemmerFactory()
lemmatizer = factory.create_stemmer()

# Tokens that bypass stemming. The first group maps each term to itself so it
# is preserved verbatim.
FINANCIAL_TERMS = {
    term: term
    for term in (
        # Payment methods
        "gopay", "ovo", "dana", "shopeepay",
        # Banks
        "bca", "bni", "bri", "mandiri",
        # Financial terms
        "kpr", "atm", "rekening", "deposito",
        # Providers
        "pln", "pdam", "telkomsel", "indihome",
        # Currencies
        "rp", "juta", "ribu",
    )
}

# Second group: modern/English terms normalized to a canonical token.
# NOTE(review): "top up" contains a space, while preprocess_text looks tokens
# up one whitespace-separated word at a time, so this key never matches there.
FINANCIAL_TERMS.update({
    "bibit": "investasi",
    "pluang": "investasi",
    "stockbit": "investasi",
    "fitness": "gym",
    "center": "gym",
    "membership": "member",
    "top up": "isi ulang",
    "invest": "investasi",
    "saham": "investasi",
})

def preprocess_text(text):
    """Normalize a transaction description before tokenization.

    Steps, in order: lowercase the text, rewrite a few English loanwords and
    phrases to Indonesian equivalents, strip punctuation, then map every
    whitespace-separated token through ``FINANCIAL_TERMS`` or, when the token
    is not listed there, through the stemmer.

    Args:
        text: Raw transaction description. ``None`` and NaN are treated as an
            empty string; any other non-string value is converted with ``str``.

    Returns:
        str: The processed tokens joined by single spaces (``''`` for empty
        input).
    """
    # Missing values would otherwise crash on `.lower()`.
    if text is None or (isinstance(text, float) and text != text):
        text = ''
    elif not isinstance(text, str):
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Replace English financial terms with their Indonesian equivalents.
    replacements = {
        'membership': 'member',
        'fitness center': 'gym',
        'bibit': 'aplikasi investasi',
        'top up': 'isi ulang',
        'invest': 'investasi'
    }
    for eng, ind in replacements.items():
        # Whole-word matching only: a plain substring replace turned the word
        # "investasi" (including the one just produced for "bibit") into
        # "investasiasi" because it contains "invest".
        text = re.sub(rf'\b{re.escape(eng)}\b', ind, text)

    # Remove special chars (everything that is neither a word char nor whitespace)
    text = re.sub(r'[^\w\s]', '', text)

    # Preserve known financial terms verbatim/normalized; stem everything else.
    tokens = [
        FINANCIAL_TERMS[token] if token in FINANCIAL_TERMS else lemmatizer.stem(token)
        for token in text.split()
    ]

    return ' '.join(tokens)

## Loading Data

In [8]:
def load_and_augment_data(csv_path):
    """Load the labelled transactions CSV and append hand-written examples.

    Args:
        csv_path: Path to a CSV file that has at least the columns ``text``
            and ``label``.

    Returns:
        pandas.DataFrame: The CSV rows followed by the built-in modern
        transaction examples, with a fresh 0..n-1 index and an extra
        ``processed_text`` column produced by ``preprocess_text``.

    Raises:
        ValueError: If the CSV lacks the ``text`` or ``label`` column.
    """
    df = pd.read_csv(csv_path)

    # Fail early with a clear message instead of a confusing error further down.
    missing_columns = {'text', 'label'} - set(df.columns)
    if missing_columns:
        raise ValueError(
            f"CSV '{csv_path}' is missing required column(s): {sorted(missing_columns)}"
        )

    # Extra examples covering modern apps/services.
    modern_transactions = [
        ["Bayar member gym bulanan", "Health & Fitness"],
        ["Pembayaran fitness center", "Health & Fitness"],
        ["Investasi saham via Bibit", "Savings & Investments"],
        ["Transfer BCA untuk cicilan rumah", "Debt & Loans"],
        ["Top up investasi Bibit 1jt", "Savings & Investments"],
        ["Belanja saham di Stockbit", "Savings & Investments"],
        ["Bayar premi asuransi", "Utilities"],
        ["Pembelian emas di Pluang", "Savings & Investments"],
        ["Isi saldo Bibit", "Savings & Investments"],
        ["Langganan gym premium", "Health & Fitness"]
    ]

    modern_df = pd.DataFrame(modern_transactions, columns=['text', 'label'])
    # ignore_index=True: without it the appended rows reuse index labels 0..9,
    # leaving duplicate labels in the combined frame.
    df = pd.concat([df, modern_df], ignore_index=True)

    # Preprocess all text
    df['processed_text'] = df['text'].apply(preprocess_text)
    return df

## Model Definition

In [9]:
def create_lstm_model(vocab_size, num_classes, max_len):
    """Build and compile the bidirectional-LSTM text classifier.

    Architecture: Embedding(128, zero-masked) -> BiLSTM(128, sequences) ->
    BiLSTM(64) -> Dense(128, relu) -> Dropout(0.5) -> Dense(softmax).

    Args:
        vocab_size: Size of the embedding vocabulary (``input_dim``).
        num_classes: Number of output categories.
        max_len: Length of the padded input sequences.

    Returns:
        A compiled Keras ``Sequential`` model using Adam (lr=0.001) and
        categorical cross-entropy, tracking accuracy, precision and recall.
    """
    model = Sequential([
        # The sequence length is declared with an explicit Input layer rather
        # than Embedding's `input_length` argument, which is deprecated in
        # Keras 3. This keeps model.input_shape[1] == max_len.
        tf.keras.Input(shape=(max_len,)),
        # mask_zero=True makes downstream layers skip the 0-valued padding.
        Embedding(input_dim=vocab_size, output_dim=128, mask_zero=True),
        Bidirectional(LSTM(128, return_sequences=True, dropout=0.2)),
        Bidirectional(LSTM(64, dropout=0.2)),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy',
                tf.keras.metrics.Precision(name='precision'),
                tf.keras.metrics.Recall(name='recall')]
    )
    return model

## Model Training

In [10]:
def train_model(df, model_save_path='model_artifacts/finance_model.h5'):
    """Train the LSTM classifier and persist the model, tokenizer and labels.

    Args:
        df: DataFrame with a ``label`` column and a ``processed_text`` column.
        model_save_path: Where ``ModelCheckpoint`` writes the best model.

    Returns:
        tuple: ``(model, tokenizer, label2id, id2label, max_len)``.

    Side effects:
        Writes the checkpoint to ``model_save_path`` and pickles the tokenizer
        and label mappings to ``model_artifacts/tokenizer.pkl`` and
        ``model_artifacts/label_mappings.pkl``.
    """
    # Make sure the output directories exist up front; otherwise the pickle
    # dumps at the end can fail after the whole training run has completed.
    os.makedirs('model_artifacts', exist_ok=True)
    model_dir = os.path.dirname(model_save_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    # Prepare labels: ids follow the sorted order of the label names.
    label2id = {label: i for i, label in enumerate(sorted(df['label'].unique()))}
    id2label = {i: label for label, i in label2id.items()}
    label_ids = df['label'].map(label2id).to_numpy()
    # One-hot encode as float32 so the target dtype does not depend on the
    # pandas version (get_dummies returns bool columns in pandas >= 2).
    y = np.eye(len(label2id), dtype='float32')[label_ids]

    # Tokenization (filters='' because the text is already cleaned)
    num_words = 10000
    tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>", filters='')
    tokenizer.fit_on_texts(df['processed_text'])
    # The tokenizer never emits an index >= num_words, so the embedding does
    # not need more rows than that even if more distinct words were seen.
    vocab_size = min(len(tokenizer.word_index) + 1, num_words)

    # Sequence preparation: pad every sequence to the longest one.
    sequences = tokenizer.texts_to_sequences(df['processed_text'])
    max_len = max(len(seq) for seq in sequences)
    X = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

    # Class weights to counter label imbalance, keyed by class id.
    classes = np.unique(label_ids)
    weights = compute_class_weight('balanced', classes=classes, y=label_ids)
    class_weights = {int(cls): float(weight) for cls, weight in zip(classes, weights)}

    # Train-test split
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=df['label']
    )

    # Model training
    model = create_lstm_model(vocab_size, len(label2id), max_len)

    callbacks = [
        EarlyStopping(patience=5, restore_best_weights=True),
        ModelCheckpoint(model_save_path, save_best_only=True)
    ]

    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=50,
        batch_size=32,
        class_weight=class_weights,
        callbacks=callbacks
    )

    # Save artifacts
    with open('model_artifacts/tokenizer.pkl', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('model_artifacts/label_mappings.pkl', 'wb') as handle:
        pickle.dump((label2id, id2label), handle, protocol=pickle.HIGHEST_PROTOCOL)

    return model, tokenizer, label2id, id2label, max_len

## Model Evaluation and Testing

In [11]:
class FinancialClassifier:
    """Inference wrapper around the saved model, tokenizer and label mappings."""

    def __init__(self, model_path='model_artifacts/finance_model.h5',
                 tokenizer_path='model_artifacts/tokenizer.pkl',
                 label_mappings_path='model_artifacts/label_mappings.pkl'):
        """Load the trained artifacts from disk.

        Args:
            model_path: Path to the saved Keras model.
            tokenizer_path: Path to the pickled tokenizer.
            label_mappings_path: Path to the pickled ``(label2id, id2label)``
                tuple.
        """
        self.model = load_model(model_path)
        # NOTE: pickle.load can execute arbitrary code; only load artifact
        # files that were produced by a trusted training run.
        with open(tokenizer_path, 'rb') as handle:
            self.tokenizer = pickle.load(handle)
        with open(label_mappings_path, 'rb') as handle:
            self.label2id, self.id2label = pickle.load(handle)
        # The padded sequence length is read back from the model's input shape.
        self.max_len = self.model.input_shape[1]

    def predict(self, text, confidence_threshold=0.7):
        """Classify one transaction description.

        Args:
            text: Raw transaction description.
            confidence_threshold: Predictions whose top probability is below
                this value are reported as ``'LOW_CONFIDENCE'``.

        Returns:
            dict: Keys ``original_text``, ``processed_text``, ``prediction``,
            ``confidence`` and ``all_predictions`` (only categories with
            probability above 0.05), plus ``suggestion`` when the prediction
            falls below the threshold.
        """
        # Apply the same preprocessing and padding that was used for training.
        processed = preprocess_text(text)
        seq = self.tokenizer.texts_to_sequences([processed])
        padded = pad_sequences(seq, maxlen=self.max_len, padding='post', truncating='post')

        # Predict
        proba = self.model.predict(padded, verbose=0)[0]
        # Plain int so the id2label lookup does not rely on NumPy scalar hashing.
        pred_id = int(np.argmax(proba))
        confidence = proba[pred_id]

        # Prepare results
        result = {
            'original_text': text,
            'processed_text': processed,
            'prediction': self.id2label[pred_id],
            'confidence': float(confidence),
            'all_predictions': {
                self.id2label[i]: float(p)
                for i, p in enumerate(proba)
                if p > 0.05
            }
        }

        # Apply confidence threshold
        if confidence < confidence_threshold:
            result['prediction'] = 'LOW_CONFIDENCE'
            result['suggestion'] = 'Needs manual review'

        return result

In [12]:
# End-to-end driver: load data, train, reload the saved artifacts, and print
# predictions for a handful of sample transactions.
if __name__ == "__main__":
    try:
        # 1. Load and augment data
        df = load_and_augment_data('datasets/transactions.csv')
        print(f"Loaded {len(df)} samples with {len(df['label'].unique())} categories")

        # 2. Train the model
        print("\nTraining model...")
        model, tokenizer, label2id, id2label, max_len = train_model(df)

        # 3. Initialize classifier (reloads the artifacts written by training)
        classifier = FinancialClassifier()

        # 4. Test predictions
        test_cases = [
            "Gaji bulan Desember dari kantor",
            "Bayar tagihan listrik PLN",
            "Beli makan siang di warteg",
            "Transfer ke BCA untuk angsuran KPR",
            "Isi pulsa Telkomsel 100rb",
            "Investasi saham di Bibit",
            "Bayar membership fitness center",
            "Donasi untuk korban bencana",
            "Top up Bibit 500rb",
            "Langganan gym premium"
        ]

        print("\nEnhanced Test Predictions:")
        for text in test_cases:
            result = classifier.predict(text)
            print(f"\nOriginal: {result['original_text']}")
            print(f"Processed: {result['processed_text']}")
            print(f"Prediction: {result['prediction']} (Confidence: {result['confidence']:.2%})")
            if 'suggestion' in result:
                print(f"⚠️ {result['suggestion']}")
            print("Details:")
            # predict() already limits all_predictions to probabilities > 0.05,
            # so no second filter is needed here.
            for cat, prob in result['all_predictions'].items():
                print(f"- {cat}: {prob:.2%}")

    except FileNotFoundError as e:
        # Report the path that is actually missing: it may be the dataset or
        # one of the model artifact files, not necessarily transactions.csv.
        missing_path = e.filename if e.filename is not None else e
        print(f"Error: File '{missing_path}' not found")
    except Exception as e:
        print(f"Error: {str(e)}")

Loaded 148 samples with 12 categories

Training model...




Epoch 1/50
[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m24s[0m 8s/step - accuracy: 0.0625 - loss: 2.4764 - precision: 0.0000e+00 - recall: 0.0000e+00



[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 408ms/step - accuracy: 0.0849 - loss: 2.4841 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.2000 - val_loss: 2.4825 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 2/50
[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 37ms/step - accuracy: 0.1875 - loss: 2.5348 - precision: 0.0000e+00 - recall: 0.0000e+00



[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.2249 - loss: 2.4999 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.2333 - val_loss: 2.4797 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 3/50
[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 38ms/step - accuracy: 0.1875 - loss: 2.5495 - precision: 0.0000e+00 - recall: 0.0000e+00



[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.2395 - loss: 2.4990 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.2667 - val_loss: 2.4749 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 4/50
[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 32ms/step - accuracy: 0.0938 - loss: 2.4546 - precision: 0.0000e+00 - recall: 0.0000e+00



[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.2564 - loss: 2.4617 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.3333 - val_loss: 2.4676 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 5/50
[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 39ms/step - accuracy: 0.3750 - loss: 2.3298 - precision: 0.0000e+00 - recall: 0.0000e+00



[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.4159 - loss: 2.4110 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.2667 - val_loss: 2.4556 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 6/50
[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 33ms/step - accuracy: 0.4062 - loss: 2.3879 - precision: 0.0000e+00 - recall: 0.0000e+00



[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.4299 - loss: 2.4168 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.2333 - val_loss: 2.4383 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 7/50
[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 34ms/step - accuracy: 0.4062 - loss: 2.4847 - precision: 0.0000e+00 - recall: 0.0000e+00



[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.4390 - loss: 2.4232 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.3000 - val_loss: 2.4069 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 8/50
[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 35ms/step - accuracy: 0.5312 - loss: 2.3378 - precision: 0.0000e+00 - recall: 0.0000e+00



[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.4864 - loss: 2.3452 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.2667 - val_loss: 2.3487 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 9/50
[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 25ms/step - accuracy: 0.4062 - loss: 2.2061 - precision: 0.0000e+00 - recall: 0.0000e+00



[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 0.4594 - loss: 2.2057 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.2667 - val_loss: 2.2462 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 10/50
[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 43ms/step - accuracy: 0.5312 - loss: 2.0265 - precision: 0.0000e+00 - recall: 0.0000e+00



[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.5195 - loss: 2.0227 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.2667 - val_loss: 2.1361 - val_precision: 0.5000 - val_recall: 0.0333
Epoch 11/50
[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 41ms/step - accuracy: 0.6250 - loss: 1.7452 - precision: 1.0000 - recall: 0.0625



[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.5081 - loss: 1.7676 - precision: 0.9667 - recall: 0.0935 - val_accuracy: 0.2667 - val_loss: 2.0419 - val_precision: 0.2500 - val_recall: 0.0333
Epoch 12/50
[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 45ms/step - accuracy: 0.5938 - loss: 1.5751 - precision: 1.0000 - recall: 0.2812



[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.5242 - loss: 1.5876 - precision: 0.9247 - recall: 0.2199 - val_accuracy: 0.4000 - val_loss: 1.9095 - val_precision: 0.5714 - val_recall: 0.1333
Epoch 13/50
[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 34ms/step - accuracy: 0.6562 - loss: 1.2345 - precision: 0.9231 - recall: 0.3750



[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.5901 - loss: 1.3195 - precision: 0.8834 - recall: 0.3288 - val_accuracy: 0.4667 - val_loss: 1.8005 - val_precision: 0.7778 - val_recall: 0.2333
Epoch 14/50
[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 43ms/step - accuracy: 0.5312 - loss: 1.2500 - precision: 0.8667 - recall: 0.4062



[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.5756 - loss: 1.1240 - precision: 0.9126 - recall: 0.4359 - val_accuracy: 0.5333 - val_loss: 1.6780 - val_precision: 0.7273 - val_recall: 0.2667
Epoch 15/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.7222 - loss: 0.9260 - precision: 0.9224 - recall: 0.4859 - val_accuracy: 0.5000 - val_loss: 1.7733 - val_precision: 0.6429 - val_recall: 0.3000
Epoch 16/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.7580 - loss: 0.8038 - precision: 0.9478 - recall: 0.5896 - val_accuracy: 0.5333 - val_loss: 1.7183 - val_precision: 0.6875 - val_recall: 0.3667
Epoch 17/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.8697 - loss: 0.5600 - precision: 0.9395 - recall: 0.6800 - val_accuracy: 0.6333 - val_loss: 1.7360 - val_precision: 0.6316 - val_recall: 0.4000
Epoch 18/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━




Enhanced Test Predictions:

Original: Gaji bulan Desember dari kantor
Processed: gaji bulan desember dari kantor
Prediction: Income (Confidence: 99.70%)
Details:
- Income: 99.70%

Original: Bayar tagihan listrik PLN
Processed: bayar tagih listrik pln
Prediction: Utilities (Confidence: 71.33%)
Details:
- Debt & Loans: 14.22%
- Utilities: 71.33%

Original: Beli makan siang di warteg
Processed: beli makan siang di warteg
Prediction: Food & Dining (Confidence: 95.67%)
Details:
- Food & Dining: 95.67%

Original: Transfer ke BCA untuk angsuran KPR
Processed: transfer ke bca untuk angsur kpr
Prediction: Debt & Loans (Confidence: 75.65%)
Details:
- Debt & Loans: 75.65%
- Transportation: 17.62%

Original: Isi pulsa Telkomsel 100rb
Processed: isi pulsa telkomsel 100rb
Prediction: LOW_CONFIDENCE (Confidence: 48.16%)
⚠️ Needs manual review
Details:
- Debt & Loans: 5.20%
- Gifts & Donations: 8.84%
- Income: 8.26%
- Miscellaneous: 10.60%
- Savings & Investments: 7.11%
- Utilities: 48.16%

Original: