# Step 1: Install Required Libraries

In [1]:
!pip install tensorflow
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D




2024-05-23 07:25:54.881652: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Step 2: Load the Dataset

In [2]:
df = pd.read_excel('Spending_Pattern_Dataset.xlsx')

# Step 3: Data Preprocessing

In [3]:
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

df['Processed_Description'] = df['Merchant_Name'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nasirhussain/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nasirhussain/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Step 4: Encode Labels

In [4]:
label_encoder = LabelEncoder()
df['MCC_Encoded'] = label_encoder.fit_transform(df['MCC'])


# Step 5: Train-Test Split

In [5]:
X = df['Processed_Description']
y = df['MCC_Encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Step 6: Tokenization and Padding

In [6]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

max_length = max([len(x) for x in X_train_sequences])
X_train_padded = tf.keras.preprocessing.sequence.pad_sequences(X_train_sequences, maxlen=max_length, padding='post')
X_test_padded = tf.keras.preprocessing.sequence.pad_sequences(X_test_sequences, maxlen=max_length, padding='post')


# Step 7: Model Development and Training

In [7]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=64, input_length=max_length))
model.add(SpatialDropout1D(0.3))
model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(64, activation='relu'))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train_padded, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)


Epoch 1/10




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 0.0385 - loss: 2.8907 - val_accuracy: 0.1429 - val_loss: 2.8902
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - accuracy: 0.1923 - loss: 2.8878 - val_accuracy: 0.1429 - val_loss: 2.8906
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - accuracy: 0.0769 - loss: 2.8876 - val_accuracy: 0.1429 - val_loss: 2.8910
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.1923 - loss: 2.8846 - val_accuracy: 0.1429 - val_loss: 2.8914
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 0.1538 - loss: 2.8825 - val_accuracy: 0.1429 - val_loss: 2.8918
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 0.1923 - loss: 2.8805 - val_accuracy: 0.1429 - val_loss: 2.8922
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

# Step 8: Model Evaluation

In [8]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=64, input_length=max_length))
model.add(SpatialDropout1D(0.3))
model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(64, activation='relu'))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train_padded, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)


Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.0769 - loss: 2.8899 - val_accuracy: 0.0000e+00 - val_loss: 2.8894
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 0.1538 - loss: 2.8875 - val_accuracy: 0.1429 - val_loss: 2.8897
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 0.1923 - loss: 2.8859 - val_accuracy: 0.1429 - val_loss: 2.8902
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step - accuracy: 0.1923 - loss: 2.8824 - val_accuracy: 0.1429 - val_loss: 2.8908
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 0.1923 - loss: 2.8796 - val_accuracy: 0.1429 - val_loss: 2.8914
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step - accuracy: 0.2308 - loss: 2.8757 - val_accuracy: 0.1429 - val_loss: 2.8919
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━

To run the provided code, follow these instructions:

Install Required Libraries: Ensure you have the necessary libraries installed. You can install them using pip if you haven't already:

pip install pandas numpy nltk scikit-learn tensorflow

Download NLTK Data: The code uses NLTK for natural language processing tasks. If you haven't downloaded NLTK data before, you'll need to do so. Run Python and execute the following:


import nltk
nltk.download('stopwords')
nltk.download('wordnet')

Download the Dataset: Make sure you have the dataset file Spending_Pattern_Dataset.xlsx in the same directory as your code. If it's located elsewhere, provide the correct path to the dataset file in the code.

Run the Code: Execute the code in a Python environment. You can use any Python IDE, text editor, or Jupyter Notebook. Copy the entire code provided and paste it into your Python environment.

Review Results: After running the code, review the output. It should include information about the preprocessing steps, model development, training progress, and evaluation metrics such as accuracy and classification report.

Interpretation and Analysis: Interpret the results to assess the effectiveness of the model in categorizing transaction descriptions. Pay attention to the accuracy score and other evaluation metrics as well as any insights gained from the analysis.

By following these instructions, you should be able to run the provided code successfully and evaluate the NLP model for categorizing transaction descriptions. If you encounter any issues or have questions, feel free to ask for assistance!






