In [None]:
import pandas as pd
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import numpy as np
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import pickle
from pymongo import MongoClient
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import precision_recall_fscore_support

In [None]:
def _connect_mongo(host, port, username, password, db):
    """Open a MongoDB connection and return a handle to database ``db``.

    When both ``username`` and ``password`` are truthy an authenticated
    ``mongodb://`` URI is built; otherwise the client connects to
    ``host``/``port`` without credentials.

    Args:
        host: Server host name or IP address.
        port: Server TCP port.
        username: Raw (unescaped) user name, or a falsy value for no auth.
        password: Raw (unescaped) password, or a falsy value for no auth.
        db: Name of the database to select.

    Returns:
        The ``conn[db]`` database object of the newly created ``MongoClient``.
    """
    if username and password:
        # Local import keeps the function self-contained.
        from urllib.parse import quote_plus

        # Reserved characters in credentials (':', '/', '@', '%', ...) must be
        # percent-escaped, otherwise the URI is parsed incorrectly or rejected.
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (
            quote_plus(str(username)),
            quote_plus(str(password)),
            host,
            port,
            db,
        )
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)

    return conn[db]


def read_mongo(db, collection, query=None, host='localhost', port=27017, username=None, password=None, no_id=True):
    """Run a ``find`` on a MongoDB collection and load the result into a DataFrame.

    Args:
        db: Name of the database to read from.
        collection: Name of the collection inside ``db``.
        query: Filter document passed to ``find``. ``None`` (the default) is
            replaced by the empty filter ``{}``.
        host: MongoDB host name.
        port: MongoDB port.
        username: Optional user name (only used together with ``password``).
        password: Optional password (only used together with ``username``).
        no_id: When true, drop the ``_id`` column from the result if present.

    Returns:
        A ``pandas.DataFrame`` built from the list of returned documents.
        It is empty (no columns) when nothing matches.
    """
    # A fresh dict per call replaces the former shared mutable ``{}`` default.
    if query is None:
        query = {}

    # Connect to MongoDB
    database = _connect_mongo(host=host, port=port, username=username, password=password, db=db)

    try:
        # list() drains the cursor, so all data is in memory before the
        # client is closed below.
        df = pd.DataFrame(list(database[collection].find(query)))
    finally:
        # Every call opens its own client; close it so connections do not
        # accumulate across repeated reads.
        database.client.close()

    # An empty result produces a frame without an '_id' column, so the
    # delete is guarded to avoid a KeyError.
    if no_id and '_id' in df.columns:
        del df['_id']

    return df

In [None]:
# Load the 'Ledger' collection of the 'NLP' database into a DataFrame.
df = read_mongo(db='NLP', collection='Ledger')

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Text column that the following cells clean and vectorize.
transaction = df['Transaction']

In [None]:
# Normalize case: lower-case every transaction description.
transaction = transaction.apply(
    lambda text: text.lower()
)

In [None]:
# Strip every run of ASCII digits from each transaction description.
r = re.compile('[0-9]+')
transaction = transaction.apply(lambda text: r.sub('', text))

In [None]:
# Inspect the text after lower-casing and digit removal.
transaction

In [None]:
# English stop-word list from the NLTK corpus (used below to filter tokens)
stopword = stopwords.words('english')

# WordNet lemmatizer instance shared by lem()
wordnet_lemmatizer = WordNetLemmatizer()

def lem(text):
    """Tokenize ``text`` and lemmatize each token with part-of-speech ``'v'``.

    Args:
        text: The string to tokenize.

    Returns:
        A list with one lemmatized entry per token, in token order.
    """
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(wordnet_lemmatizer.lemmatize(token, 'v'))
    return lemmas

# Replace each description by its list of lemmatized tokens.
transaction = transaction.apply(lem)
transaction

In [None]:
# Remove stop words. The stop-word list is converted to a set once so each
# membership test is a hash lookup rather than a scan of the whole list;
# the resulting token lists are unchanged.
stopword_set = set(stopword)
transaction = transaction.apply(
    lambda tokens: [word for word in tokens if word not in stopword_set]
)
transaction

In [None]:
# vectorizing using TF-IDF
def identity_tokenizer(text):
    """Return ``text`` unchanged.

    Passed as the ``tokenizer`` of the TF-IDF vectorizer because the
    documents are already lists of tokens at that point.
    """
    return text

# The documents are already token lists, so the vectorizer must neither
# re-tokenize nor lower-case them.
# NOTE(review): stop_words='english' filters a second time on top of the
# NLTK stop-word removal done earlier - confirm both passes are intended.
tfidf = TfidfVectorizer(
    tokenizer=identity_tokenizer,
    stop_words='english',
    lowercase=False,
)
features = tfidf.fit_transform(transaction)

In [None]:
# Fit a label encoder on the 'Type' column (the classification target).
labels = df['Type']
le = preprocessing.LabelEncoder()
le.fit(labels)

In [None]:
# Classes learned by the label encoder
le.classes_

In [None]:
# Encode the 'Type' labels with the fitted encoder. Reading from df['Type']
# instead of `labels` keeps this cell re-runnable: it no longer feeds its own
# already-encoded output back into the encoder on a second execution.
labels = le.transform(df['Type'])

In [None]:
# Convert the sparse TF-IDF matrix into a dense NumPy array.
features = features.toarray()

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=101,
)

In [None]:
# Create the logistic regression classifier, then fit it on the training split.
clf = LogisticRegression(random_state=0)
clf = clf.fit(X_train, y_train)

In [None]:
# Accuracy on the training split
clf.score(X_train, y_train)

In [None]:
# Accuracy on the held-out test split
clf.score(X_test, y_test)

In [None]:
# Predict encoded labels for the test split
y_pred = clf.predict(X_test)

In [None]:
# Confusion matrix of true versus predicted labels on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix display. `cm` was computed without an explicit `labels`
# argument, so it only has rows/columns for label codes that occur in y_test
# or y_pred, in sorted order. Map exactly those codes back to class names so
# the tick labels always line up with the matrix, even when a class is
# missing from the test split (le.classes_ would then be too long).
present_codes = np.union1d(y_test, y_pred)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=le.inverse_transform(present_codes),
)

In [None]:
# Plot the confusion matrix with the 'magma' colormap
disp.plot(cmap='magma')

In [None]:
# Macro-averaged precision, recall and F-score on the test split
precision_recall_fscore_support(y_test, y_pred, average='macro')

In [None]:
# Per-class recall: correctly predicted count (diagonal) divided by the
# number of true samples of each class (row sum of the confusion matrix)
cm.diagonal()/cm.sum(axis=1)

In [None]:
# Save the fitted artifacts to disk. Each file is opened in a `with` block so
# the handle is flushed and closed as soon as its dump finishes, instead of
# being left open for the lifetime of the kernel.
# NOTE: pickle stores functions by reference (module + name), so loading
# 'tfidf.sav' or 'identity_tokenizer.sav' requires `identity_tokenizer` to be
# defined under the same name in the loading environment.

with open('encoder.sav', 'wb') as fh:
    pickle.dump(le, fh)

with open('model.sav', 'wb') as fh:
    pickle.dump(clf, fh)

# Dense feature matrix and encoded labels, stored together as a tuple.
data_target = features, labels
with open('data_target.sav', 'wb') as fh:
    pickle.dump(data_target, fh)

with open('tfidf.sav', 'wb') as fh:
    pickle.dump(tfidf, fh)

with open('identity_tokenizer.sav', 'wb') as fh:
    pickle.dump(identity_tokenizer, fh)