In [1]:
import pandas as pd
import nltk
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

In [20]:
df = pd.read_csv ("classifier2.csv",on_bad_lines='skip')
df
# df.head()

Unnamed: 0,information element,classe
0,personal data,data
1,additional data,data
2,why collecting account information,policy
3,where delivered messages are stored,data
4,how to deal with undelivered messages,process
...,...,...
245,other google services,data
246,installation tracking,process
247,promotion tracking,policy
248,synchronization,policy


In [21]:
# turn the classes and paragraphs into lists 

IEs = df['information element'].tolist()
classes = df['classe'].tolist()
IEs [1], classes [1]

('additional data', 'data')

In [22]:
# It is better to split the dataset only once and store the two datasets in two separate files
# If the samples (separate workouts) will be different then we can't
# compare between different models if you apply separate training

X1_train, X1_test, y_train, y_test = train_test_split(df["information element"],df["classe"],test_size=0.2, random_state=42)

Tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
# count_vectorizer = CountVectorizer()
X_train = Tfidf_vectorizer.fit_transform(X1_train)
X_test = Tfidf_vectorizer.transform(X1_test)

In [23]:
#training Naive Bayes

naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)

In [25]:
#test Naive Bayes

Y_naive_bayes = naive_bayes.predict(X_test)

In [26]:
print(classification_report(y_test, Y_naive_bayes))

              precision    recall  f1-score   support

        data       0.63      0.96      0.76        28
      policy       0.67      0.14      0.24        14
     process       0.50      0.25      0.33         8

    accuracy                           0.62        50
   macro avg       0.60      0.45      0.44        50
weighted avg       0.62      0.62      0.55        50



In [27]:
import matplotlib.pyplot as plt


confusion_matrix = confusion_matrix(y_test, Y_naive_bayes)
# cm_display = ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = [False, True])
# cm_display.plot()
# plt.show()


TypeError: 'numpy.ndarray' object is not callable

In [12]:
# The prediction function
# For naive bayes, the prediction must go through the same pipeline

def predict(text_str):
    x_input = Tfidf_vectorizer.transform([text_str])
    print (naive_bayes.predict(x_input))
    return naive_bayes.predict_proba(x_input)[0]

In [13]:
#=====================================================================
# UNIT TEST 
#=====================================================================
# RESULTS : process
#---------------------------------------------------------------------
predict("deal with forwarded media")

['process']


array([0.32619081, 0.21190217, 0.46190702])

In [14]:
#=====================================================================
# UNIT TEST 
#=====================================================================
# RESULTS : policy
#---------------------------------------------------------------------
predict("cookies information")

['policy']


array([0.36558468, 0.50217019, 0.13224513])

In [15]:
#=====================================================================
# UNIT TEST 
#=====================================================================
# RESULTS : data
#---------------------------------------------------------------------
predict("payment information")

['data']


array([0.74587019, 0.12272533, 0.13140447])

In [16]:
import pickle
pickle.dump(naive_bayes, open('./naive_bayes_2.pkl', 'wb'))
pickle.dump(Tfidf_vectorizer, open('./count_vectorizer_2.pkl', 'wb'))

In [17]:
naive_bayes = pickle.load(open('naive_bayes_2.pkl', 'rb'))
naive_bayes

In [18]:
Tfidf_vectorizer = pickle.load(open('count_vectorizer_2.pkl', 'rb'))
Tfidf_vectorizer

In [19]:
#===========================================================================================
# Testing the prediction function using the reloaded objects (ML model + Count Vectorizer)
#===========================================================================================
# RESULTS : policy
#-------------------------------------------------------------------------------------------
from scipy.sparse import csr_matrix
str = ['using end to end encryption']
x_input = Tfidf_vectorizer.transform([word.lower() for word in str])
csr_matrix.toarray(x_input)
# print (csr_matrix.toarray(x_input)[0])
naive_bayes.predict(csr_matrix.toarray(x_input))[0]

'policy'