In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report



In [3]:
# Load every transaction from the combined CSV and shuffle the rows with a
# fixed seed so the row order is reproducible across runs.
# A notebook kernel does not define __file__, so the data directory is resolved
# against the current working directory (this is also what the previous
# os.path.abspath('__file__') string-literal trick evaluated to).
base_dir = os.getcwd()
csv_path = os.path.join(base_dir, "./processed/combined_transactions.csv")

df = pd.read_csv(csv_path).sample(frac=1, random_state=42).reset_index(drop=True)

# Feature engineering
# Columns in the CSV: Transaction Date, Posted Date, Card No., Description,
# Category, Debit, Credit, isFraud.
# Only Description, the derived Payment amount and the Category label are kept.
# NOTE: the training cell currently feeds only Description to the models.

# Payment is Debit + Credit. fill_value=0 treats a blank Debit or Credit cell
# as zero so a row with just one of the two amounts keeps that amount instead
# of turning into NaN; the result is still NaN when both amounts are missing.
payment_column = df['Debit'].add(df['Credit'], fill_value=0).rename('Payment')

# Drop the unused columns in one non-mutating step, then place Payment right
# after Description (column position 1).
df = df.drop(columns=['Transaction Date', 'Posted Date', 'Card No.', 'isFraud', 'Debit', 'Credit'])
df.insert(1, 'Payment', payment_column)

# print(df['Category'].unique())
df.head()

Unnamed: 0,Description,Payment,Category
0,Maya Mobile,18.79,Phone/Cable
1,LA MAGDALENA DE PROUST,5.94,Dining
2,BP,75.43,Gas Station
3,UBR* PENDING.UBER.COM,7.47,Other Travel
4,LIDL BRAVO MURILLO,2.83,Merchandise


In [4]:

# The model input is the description text; the target is the category label.
X = df['Description']
y = df['Category']
# print(len(X))

# Hold out 30% of the rows for testing, with a fixed seed for a repeatable split.
# Open question from the author: a 30-70 split improves the accuracy a bit --
# does that make the model more overfit or underfit?
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=22)


def _tfidf_pipeline(classifier):
    """Return a Pipeline that TF-IDF vectorizes the text and then applies `classifier`."""
    return Pipeline([('tfidf', TfidfVectorizer()), ('clf', classifier)])


# One pipeline per classifier: two naive Bayes variants and a linear SVM.
pipeMNB = _tfidf_pipeline(MultinomialNB())
pipeCNB = _tfidf_pipeline(ComplementNB())
pipeSVC = _tfidf_pipeline(LinearSVC())

# Train the models (same order as they are reported below).
for fitted_pipeline in (pipeMNB, pipeCNB, pipeSVC):
    fitted_pipeline.fit(X_train, y_train)

# Predict on the held-out descriptions.
predictMNB = pipeMNB.predict(X_test)
predictCNB = pipeCNB.predict(X_test)
predictSVC = pipeSVC.predict(X_test)

# Report the accuracy of each model as a percentage.
print("Accuracy Scores:")
for model_label, model_predictions in (
    ("MultinomialNB", predictMNB),
    ("ComplementNB", predictCNB),
    ("LinearSVC", predictSVC),
):
    print(f"{model_label}: {100 * accuracy_score(y_test, model_predictions):.2f}%")


Accuracy Scores:
MultinomialNB: 61.98%
ComplementNB: 75.52%
LinearSVC: 82.81%
