In [None]:
!pip install neattext
!pip install umap-learn

In [None]:
import seaborn as sns
import neattext.functions as nfx
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
import string
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
import umap

In [None]:
# Location of the transactions CSV on the Kaggle input mount.
data_path = '/kaggle/input/comma-separated-transactions/global_sms_transactions.csv'

# Force 'Card' to be read as text so pandas does not infer a numeric dtype for it.
df = pd.read_csv(data_path, dtype={'Card': str})

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Normalise the case of the text columns with one column-wise assignment.
columns_to_lowercase = ['Currency', 'Merchant', 'Category', 'Message', 'Transaction_Type']
df[columns_to_lowercase] = df[columns_to_lowercase].apply(lambda text_col: text_col.str.lower())

In [None]:
# Preview the frame again after the lowercase conversion of the text columns.
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Dataset size as a (rows, columns) tuple.
df.shape

In [None]:
# Column names, non-null counts and dtypes.
df.info()

In [None]:
# Number of missing entries in each column.
missing_values = df.isna().sum()
print(missing_values)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# How many rows fall under each transaction type.
transaction_types_counts = df['Transaction_Type'].value_counts()

# Bar chart drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=transaction_types_counts.index,
    y=transaction_types_counts.values,
    ax=ax,
)
ax.set_title('Transaction Types Distribution')
ax.set_xlabel('Transaction Type')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# How many rows fall under each distinct category.
category_counts = df['Category'].value_counts()

# Bar chart of the category frequencies.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=category_counts.index, y=category_counts.values, ax=ax)
ax.set_title('Category Distribution')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
# Vertical tick labels so long category names do not overlap.
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Share of each transaction type, shown as a pie chart.
transaction_type_counts = df['Transaction_Type'].value_counts()

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    transaction_type_counts,
    labels=transaction_type_counts.index,
    autopct='%1.1f%%',
    startangle=140,
)
ax.axis('equal')  # equal aspect ratio keeps the pie circular
ax.set_title('Transaction Type Distribution')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Number of transactions recorded in each currency.
currency_counts = df['Currency'].value_counts()

# Bar chart of the currency frequencies.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=currency_counts.index, y=currency_counts.values, ax=ax)
ax.set_title('Currency Distribution')
ax.set_xlabel('Currency')
ax.set_ylabel('Count')
# Slanted tick labels so the currency codes stay readable.
ax.tick_params(axis='x', labelrotation=45)
plt.show()


In [None]:
# Model input is the message text; the two label columns are predicted jointly.
target_columns = ['Transaction_Type', 'Category']

Xfeatures = df['Message']
ylabels = df[target_columns]

In [None]:
# Drop every row whose category is one of the excluded ones.
excluded_categories = ['deposit', 'transfer']
is_excluded = df['Category'].isin(excluded_categories)
filtered_df = df.loc[~is_excluded]

# Message text and category labels of all remaining rows, used for the UMAP projection.
Xfeatures_filtered, ylabels_filtered = filtered_df['Message'], filtered_df['Category']

In [None]:
# Settings for the three stages below.
vocab_size, n_svd_components, seed = 1000, 50, 7

# Stage 1: bag-of-words counts over a vocabulary capped at `vocab_size` terms.
count_vectorizer = CountVectorizer(max_features=vocab_size)
X_vectorized_filtered = count_vectorizer.fit_transform(Xfeatures_filtered)
print("Vectorized shape:", X_vectorized_filtered.shape)

# Stage 2: compress the count matrix to `n_svd_components` columns with truncated SVD.
svd = TruncatedSVD(n_components=n_svd_components, random_state=seed)
X_reduced_filtered = svd.fit_transform(X_vectorized_filtered)
print("SVD reduced shape:", X_reduced_filtered.shape)

# Stage 3: embed the SVD output in two dimensions with UMAP (seeded).
umap_model = umap.UMAP(n_components=2, random_state=seed)
X_umap_filtered = umap_model.fit_transform(X_reduced_filtered)
print("UMAP shape:", X_umap_filtered.shape)

In [None]:
# Collect the 2-D embedding and its category labels in one frame for plotting.
umap_df_filtered = pd.DataFrame(X_umap_filtered, columns=['UMAP1', 'UMAP2'])

# `.to_numpy()` discards the filtered row index so the labels are matched to the
# embedding rows by position (the new frame has a fresh 0..n-1 index).
umap_df_filtered['Category'] = ylabels_filtered.to_numpy()

# Bare last expression: the notebook renders the preview as a table instead of
# the plain-text dump produced by print().
umap_df_filtered.head()

In [None]:
# One tab20 colour per distinct value in the Category column.
# NOTE(review): tab20 has 20 colours; with more categories seaborn cycles them,
# so colours would repeat. Confirm the category count.
n_categories = len(umap_df_filtered['Category'].unique())
palette = sns.color_palette("tab20", n_categories)

# Scatter of the embedding coloured by category, legend placed right of the axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=umap_df_filtered,
    x='UMAP1',
    y='UMAP2',
    hue='Category',
    palette=palette,
    ax=ax,
)
ax.set_title('UMAP visualization for Category')
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1)
plt.show()

In [None]:
# Hold out 30% of the rows for evaluation; the seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    Xfeatures,
    ylabels,
    test_size=0.3,
    random_state=7,
)

In [None]:
# Bag-of-words features feeding one logistic-regression classifier per target column.
# NOTE(review): LogisticRegression runs with default settings; check the fit output
# for a ConvergenceWarning.
text_vectorizer = CountVectorizer()
multi_target_lr = MultiOutputClassifier(LogisticRegression())

pipe_lr = Pipeline(steps=[('cv', text_vectorizer), ('lr_multi', multi_target_lr)])

In [None]:
# Fit the vectorizer and the per-target classifiers on the training split.
pipe_lr.fit(x_train,y_train)

In [None]:
# Predict both targets for the held-out messages. Downstream evaluation reads
# column 0 as Transaction_Type and column 1 as Category.
y_pred = pipe_lr.predict(x_test)

In [None]:
def report_target(target_name, y_true, y_hat):
    """Print the classification report and accuracy for one target column.

    Parameters
    ----------
    target_name : label shown in the printed headings.
    y_true : true labels of the test rows for this target.
    y_hat : predicted labels for the same rows.

    Returns
    -------
    The accuracy computed by ``accuracy_score``.
    """
    print(f"Classification report for {target_name}:")
    print(classification_report(y_true, y_hat))

    accuracy = accuracy_score(y_true, y_hat)
    print(f"Accuracy for {target_name}: {accuracy:.4f}")
    return accuracy


# Pair each prediction column with its label column by position in y_test,
# so the column index cannot drift from the label name.
target_accuracies = {
    target_name: report_target(target_name, y_test[target_name], y_pred[:, column_idx])
    for column_idx, target_name in enumerate(y_test.columns)
}

# Keep the original variable names available for later cells.
accuracy_transaction_type = target_accuracies['Transaction_Type']
accuracy_category = target_accuracies['Category']


In [None]:
# Hand-written sample messages used to spot-check the fitted pipeline.
messages = [
    "VISA0610 17:12 PURCHASE 540 RUB EVO_PAPA GREEK BALANCE: 3314.02 RUB",
    "VISA0610 15:27 transfer 1200 rubles Balance: 12,034.56 rubles",
    "PAYMENT 1726.32 USD CARD*3970 DIESEL BALANCE 362.68 USD 11:49",
    "Replenishment, account RUB.  1000 RUB.  Alina A. Available 1507.15 RUB",
    "VISA0610 02:53 Purchase 160 USD BAR-231 Balance: 6776 USD",
    "MASTERCARD0405 19:30 Crediting 1000 USD ATM 13579246 Balance: 16775.00 USD",
    "Purchase, card *5744. 1300 RUB. SMOKE. Available 7292.5 RUB",
    "Purchase, card *5744. 1323 RUB. MK Delivery. Available 108.44 RUB",
    "VISA0610 01:10 Purchase 4456 RUB OZON Balance: 1753.64 RUB",
    "FASTPAYMENT RECEIVED 250 USD FROM JOHN DOE AT STARBUCKS BALANCE: 3089.75 USD", # absolutely new transaction
    "TRANSFER INITIATED 1200 GBP TO ALICE'S BAKERY FOR CATERING SERVICES BALANCE: 5820.50 GBP", # absolutely new transaction
]

# Classify all messages in one batched call instead of one predict() per message.
# Each row of `predictions` holds (transaction type, category) for one message.
predictions = pipe_lr.predict(messages)

for message, (predicted_transaction_type, predicted_category) in zip(messages, predictions):
    print("Message:", message)
    print("Predicted Transaction Type:", predicted_transaction_type)
    print("Predicted Category:", predicted_category)
    print()  # blank line between results for readability


In [None]:
!pip install joblib

In [None]:
import joblib

# Persist the complete fitted pipeline (vectorizer and classifiers) in a single file.
# NOTE: joblib files are pickle-based, so only load them from trusted sources.
model_path = 'categorizer_model.pkl'
joblib.dump(pipe_lr, model_path)