# Bernoulli Naive Bayes Models Evaluation

This notebook evaluates the trained Bernoulli Naive Bayes models on a separate test dataset.

The dataset consists of text scraped from 20 web pages containing dark patterns.

----


In [1]:
import pandas as pd 
import numpy as np

from sklearn.preprocessing import LabelEncoder

# provides a simple way to both tokenize a collection of text documents and build a vocabulary of known words, but also to encode new documents using that vocabulary.
from sklearn.feature_extraction.text import TfidfVectorizer

# Bernoulli Naive Bayes (similar to MultinomialNB) is a classifier suitable for discrete data. The difference is that while MultinomialNB works with occurrence counts, BernoulliNB is designed for binary/boolean features, which means that for text classification, word occurrence vectors (rather than word count vectors) may be more suitable for training and using this classifier.
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

# Evaluation metrics
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score

# joblib is a set of tools to provide lightweight pipelining in Python. It provides utilities for saving and loading Python objects that make use of NumPy data structures, efficiently.
import joblib

import matplotlib.pyplot as plt
# import seaborn as sns

## Data Exploration

---
Import the testing dataset

In [2]:
# Read the labelled test set from the CSV file next to this notebook.
data = pd.read_csv(filepath_or_buffer='test_data.csv')

In [3]:
# Display the first five rows as a quick sanity check of the loaded frame.
data.head(n=5)

Unnamed: 0,content,Classification
0,Your price for this item is $ 89,1
1,Your price for this item is $ 79,1
2,Your price for this item is $ 55,1
3,Your price for this item is $ 49,1
4,Your price for this item is $ 21,1


---
`check the dataset information`

There are 3694 NOT NULL instances of content strings in the dataset.

In [4]:
# Summarise the frame: index range, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3694 entries, 0 to 3693
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   content         3694 non-null   object
 1   Classification  3694 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 57.8+ KB


In [5]:
# Count how many rows carry each value of the target column `Classification`.
tag_counts = data['Classification'].value_counts()

print('Distribution of the tags:\n{}'.format(tag_counts))

Distribution of the tags:
1    3344
0     350
Name: Classification, dtype: int64


In [6]:
# Keep only the first row for every distinct `content` string so that repeated
# text is not counted more than once in the evaluation below.
# `.copy()` gives `data` its own storage: later cells assign a `prediction`
# column to it, and assigning into a filtered frame can trigger pandas'
# SettingWithCopyWarning.
data = data.drop_duplicates(subset="content").copy()

# Re-check row count, non-null counts and dtypes after de-duplication.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3694 entries, 0 to 3693
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   content         3694 non-null   object
 1   Classification  3694 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 86.6+ KB


In [7]:
# Show the first five rows as plain text.
preview = data.head(5)
print(preview)

# Re-count the rows per target label.
tag_counts = data['Classification'].value_counts()
print('\nDistribution of the tags:\n{}'.format(tag_counts))

                            content  Classification
0  Your price for this item is $ 89               1
1  Your price for this item is $ 79               1
2  Your price for this item is $ 55               1
3  Your price for this item is $ 49               1
4  Your price for this item is $ 21               1

Distribution of the tags:
1    3344
0     350
Name: Classification, dtype: int64


---
## Data Preparation

---
`Encode the target values into integers` --- 'classification'

In [8]:
# Separate the input text (X) from the ground-truth labels (Y).
X, Y = data['content'], data['Classification']

In [9]:
# Fit the label encoder on Y and produce the integer-encoded labels in one step.
encoder = LabelEncoder()
y = encoder.fit_transform(Y)

In [10]:
# Show which integer code the encoder assigned to each original label
# (codes 0 and 1 representing 'Dark' and 'Not Dark').
integer_mapping = dict(zip(encoder.classes_, range(len(encoder.classes_))))
print(integer_mapping)

{0: 0, 1: 1}


In [11]:
# Tabulate how often each original label value occurs in Y:
# one row per label, columns are (label, count).
unique, counts = np.unique(Y, return_counts=True)
frequencies = np.array([unique, counts]).transpose()

print(frequencies)

[[   0  350]
 [   1 3344]]


In [12]:
# Tabulate how often each encoded label value occurs in y:
# one row per code, columns are (code, count).
unique, counts = np.unique(y, return_counts=True)
frequencies = np.array([unique, counts]).transpose()

print(frequencies)

[[   0  350]
 [   1 3344]]


---
# Bernoulli Naive Bayes Classifier


### Version 1
------

In [13]:
# Evaluate the saved Version 1 vectorizer + classifier on the test frame `data`:
# predict with a custom probability threshold, export the rows predicted as dark
# patterns and the misclassified rows, then print summary metrics.

# -----Text Vectorizer Loading
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifacts from a trusted source.
cv = joblib.load("V1/presence_CountVectorizer.joblib")

# -----Model Loading
clf = joblib.load("V1/bnb_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# A row is labelled 1 when column 1 of predict_proba is at least `threshold`,
# otherwise 0 (this replaces the default clf.predict decision rule).
# NOTE(review): column 1 is assumed to be the probability of class 1, i.e. clf.classes_ == [0, 1] — confirm.
threshold = 0.4
pred_vec = (clf.predict_proba(cv.transform(data['content']))[:, 1] >= threshold).astype(int)

data['prediction'] = pred_vec.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V1.csv', index=False, header=True)

# ----misclassification dataframe: rows whose prediction differs from the true label
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V1.csv', index=False, header=True)

# ----print the distribution of the prediction (one row per predicted label: label, count)
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
# Build the confusion matrix once. Explicit labels keep it 2x2 (rows = true label,
# columns = predicted label) even if only one label value occurs.
cm = metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1])
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0))
print("Confusion Matrix:\n", cm)

# Class 0 (dark pattern) is treated as the positive class.
true_pos, false_neg, false_pos = cm[0][0], cm[0][1], cm[1][0]
predicted_pos = true_pos + false_pos
actual_pos = true_pos + false_neg
# Report 0.0 instead of dividing by zero when a denominator is empty.
precision = true_pos / predicted_pos if predicted_pos else 0.0
recall = true_pos / actual_pos if actual_pos else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  532]
 [   1 3162]]

Accuracy: 0.9474824038982134
Precision: 0.6466165413533834
Confusion Matrix:
 [[ 344    6]
 [ 188 3156]]

Precison: 0.647
Recall: 0.983
F1 Score: 0.780


### Version 2
------

In [14]:
# Evaluate the saved Version 2 vectorizer + classifier on the test frame `data`:
# predict with a custom probability threshold, export the rows predicted as dark
# patterns and the misclassified rows, then print summary metrics.

# -----Text Vectorizer Loading
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifacts from a trusted source.
cv = joblib.load("V2/presence_CountVectorizer.joblib")

# -----Model Loading
clf = joblib.load("V2/bnb_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# A row is labelled 1 when column 1 of predict_proba is at least `threshold`,
# otherwise 0 (this replaces the default clf.predict decision rule).
# NOTE(review): column 1 is assumed to be the probability of class 1, i.e. clf.classes_ == [0, 1] — confirm.
threshold = 0.4
pred_vec = (clf.predict_proba(cv.transform(data['content']))[:, 1] >= threshold).astype(int)

data['prediction'] = pred_vec.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V2.csv', index=False, header=True)

# ----misclassification dataframe: rows whose prediction differs from the true label
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V2.csv', index=False, header=True)

# ----print the distribution of the prediction (one row per predicted label: label, count)
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
# Build the confusion matrix once. Explicit labels keep it 2x2 (rows = true label,
# columns = predicted label) even if only one label value occurs.
cm = metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1])
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0))
print("Confusion Matrix:\n", cm)

# Class 0 (dark pattern) is treated as the positive class.
true_pos, false_neg, false_pos = cm[0][0], cm[0][1], cm[1][0]
predicted_pos = true_pos + false_pos
actual_pos = true_pos + false_neg
# Report 0.0 instead of dividing by zero when a denominator is empty.
precision = true_pos / predicted_pos if predicted_pos else 0.0
recall = true_pos / actual_pos if actual_pos else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  744]
 [   1 2950]]

Accuracy: 0.8922577152138603
Precision: 0.46774193548387094
Confusion Matrix:
 [[ 348    2]
 [ 396 2948]]

Precison: 0.468
Recall: 0.994
F1 Score: 0.636


### Version 4
------

In [15]:
# Evaluate the saved Version 4 vectorizer + classifier on the test frame `data`:
# predict with a custom probability threshold, export the rows predicted as dark
# patterns and the misclassified rows, then print summary metrics.

# -----Text Vectorizer Loading
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifacts from a trusted source.
cv = joblib.load("V4/presence_CountVectorizer.joblib")

# -----Model Loading
clf = joblib.load("V4/bnb_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# A row is labelled 1 when column 1 of predict_proba is at least `threshold`,
# otherwise 0 (this replaces the default clf.predict decision rule).
# NOTE(review): column 1 is assumed to be the probability of class 1, i.e. clf.classes_ == [0, 1] — confirm.
threshold = 0.4
pred_vec = (clf.predict_proba(cv.transform(data['content']))[:, 1] >= threshold).astype(int)

data['prediction'] = pred_vec.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V4.csv', index=False, header=True)

# ----misclassification dataframe: rows whose prediction differs from the true label
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V4.csv', index=False, header=True)

# ----print the distribution of the prediction (one row per predicted label: label, count)
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
# Build the confusion matrix once. Explicit labels keep it 2x2 (rows = true label,
# columns = predicted label) even if only one label value occurs.
cm = metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1])
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0))
print("Confusion Matrix:\n", cm)

# Class 0 (dark pattern) is treated as the positive class.
true_pos, false_neg, false_pos = cm[0][0], cm[0][1], cm[1][0]
predicted_pos = true_pos + false_pos
actual_pos = true_pos + false_neg
# Report 0.0 instead of dividing by zero when a denominator is empty.
precision = true_pos / predicted_pos if predicted_pos else 0.0
recall = true_pos / actual_pos if actual_pos else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  462]
 [   1 3232]]

Accuracy: 0.9664320519761775
Precision: 0.7445887445887446
Confusion Matrix:
 [[ 344    6]
 [ 118 3226]]

Precison: 0.745
Recall: 0.983
F1 Score: 0.847


### Version 5
------

In [16]:
# Evaluate the saved Version 5 vectorizer + classifier on the test frame `data`:
# predict with a custom probability threshold, export the rows predicted as dark
# patterns and the misclassified rows, then print summary metrics.

# -----Text Vectorizer Loading
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifacts from a trusted source.
cv = joblib.load("V5/presence_CountVectorizer.joblib")

# -----Model Loading
clf = joblib.load("V5/bnb_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# A row is labelled 1 when column 1 of predict_proba is at least `threshold`,
# otherwise 0 (this replaces the default clf.predict decision rule).
# NOTE(review): column 1 is assumed to be the probability of class 1, i.e. clf.classes_ == [0, 1] — confirm.
threshold = 0.4
pred_vec = (clf.predict_proba(cv.transform(data['content']))[:, 1] >= threshold).astype(int)

data['prediction'] = pred_vec.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V5.csv', index=False, header=True)

# ----misclassification dataframe: rows whose prediction differs from the true label
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V5.csv', index=False, header=True)

# ----print the distribution of the prediction (one row per predicted label: label, count)
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
# Build the confusion matrix once. Explicit labels keep it 2x2 (rows = true label,
# columns = predicted label) even if only one label value occurs.
cm = metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1])
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0))
print("Confusion Matrix:\n", cm)

# Class 0 (dark pattern) is treated as the positive class.
true_pos, false_neg, false_pos = cm[0][0], cm[0][1], cm[1][0]
predicted_pos = true_pos + false_pos
actual_pos = true_pos + false_neg
# Report 0.0 instead of dividing by zero when a denominator is empty.
precision = true_pos / predicted_pos if predicted_pos else 0.0
recall = true_pos / actual_pos if actual_pos else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  452]
 [   1 3242]]

Accuracy: 0.9702219815917704
Precision: 0.7654867256637168
Confusion Matrix:
 [[ 346    4]
 [ 106 3238]]

Precison: 0.765
Recall: 0.989
F1 Score: 0.863


### Version 6
------

In [17]:
# Evaluate the saved Version 6 vectorizer + classifier on the test frame `data`:
# predict with a custom probability threshold, export the rows predicted as dark
# patterns and the misclassified rows, then print summary metrics.

# -----Text Vectorizer Loading
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifacts from a trusted source.
cv = joblib.load("V6/presence_CountVectorizer.joblib")

# -----Model Loading
clf = joblib.load("V6/bnb_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# A row is labelled 1 when column 1 of predict_proba is at least `threshold`,
# otherwise 0 (this replaces the default clf.predict decision rule).
# NOTE(review): column 1 is assumed to be the probability of class 1, i.e. clf.classes_ == [0, 1] — confirm.
threshold = 0.4
pred_vec = (clf.predict_proba(cv.transform(data['content']))[:, 1] >= threshold).astype(int)

data['prediction'] = pred_vec.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V6.csv', index=False, header=True)

# ----misclassification dataframe: rows whose prediction differs from the true label
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V6.csv', index=False, header=True)

# ----print the distribution of the prediction (one row per predicted label: label, count)
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
# Build the confusion matrix once. Explicit labels keep it 2x2 (rows = true label,
# columns = predicted label) even if only one label value occurs.
cm = metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1])
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0))
print("Confusion Matrix:\n", cm)

# Class 0 (dark pattern) is treated as the positive class.
true_pos, false_neg, false_pos = cm[0][0], cm[0][1], cm[1][0]
predicted_pos = true_pos + false_pos
actual_pos = true_pos + false_neg
# Report 0.0 instead of dividing by zero when a denominator is empty.
precision = true_pos / predicted_pos if predicted_pos else 0.0
recall = true_pos / actual_pos if actual_pos else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  496]
 [   1 3198]]

Accuracy: 0.9588521927449919
Precision: 0.6995967741935484
Confusion Matrix:
 [[ 347    3]
 [ 149 3195]]

Precison: 0.700
Recall: 0.991
F1 Score: 0.820


### Version 7
------

In [18]:
# Evaluate the saved Version 7 vectorizer + classifier on the test frame `data`:
# predict with a custom probability threshold, export the rows predicted as dark
# patterns and the misclassified rows, then print summary metrics.

# -----Text Vectorizer Loading
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifacts from a trusted source.
cv = joblib.load("V7/presence_CountVectorizer.joblib")

# -----Model Loading
clf = joblib.load("V7/bnb_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# A row is labelled 1 when column 1 of predict_proba is at least `threshold`,
# otherwise 0 (this replaces the default clf.predict decision rule).
# NOTE(review): column 1 is assumed to be the probability of class 1, i.e. clf.classes_ == [0, 1] — confirm.
threshold = 0.4
pred_vec = (clf.predict_proba(cv.transform(data['content']))[:, 1] >= threshold).astype(int)

data['prediction'] = pred_vec.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V7.csv', index=False, header=True)

# ----misclassification dataframe: rows whose prediction differs from the true label
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V7.csv', index=False, header=True)

# ----print the distribution of the prediction (one row per predicted label: label, count)
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
# Build the confusion matrix once. Explicit labels keep it 2x2 (rows = true label,
# columns = predicted label) even if only one label value occurs.
cm = metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1])
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0))
print("Confusion Matrix:\n", cm)

# Class 0 (dark pattern) is treated as the positive class.
true_pos, false_neg, false_pos = cm[0][0], cm[0][1], cm[1][0]
predicted_pos = true_pos + false_pos
actual_pos = true_pos + false_neg
# Report 0.0 instead of dividing by zero when a denominator is empty.
precision = true_pos / predicted_pos if predicted_pos else 0.0
recall = true_pos / actual_pos if actual_pos else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  480]
 [   1 3214]]

Accuracy: 0.9604764482945317
Precision: 0.7125
Confusion Matrix:
 [[ 342    8]
 [ 138 3206]]

Precison: 0.713
Recall: 0.977
F1 Score: 0.824


### Version 8
------

In [19]:
# Evaluate the saved Version 8 vectorizer + classifier on the test frame `data`:
# predict with a custom probability threshold, export the rows predicted as dark
# patterns and the misclassified rows, then print summary metrics.

# -----Text Vectorizer Loading
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifacts from a trusted source.
cv = joblib.load("V8/presence_CountVectorizer.joblib")

# -----Model Loading
clf = joblib.load("V8/bnb_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# A row is labelled 1 when column 1 of predict_proba is at least `threshold`,
# otherwise 0 (this replaces the default clf.predict decision rule).
# NOTE(review): column 1 is assumed to be the probability of class 1, i.e. clf.classes_ == [0, 1] — confirm.
threshold = 0.4
pred_vec = (clf.predict_proba(cv.transform(data['content']))[:, 1] >= threshold).astype(int)

data['prediction'] = pred_vec.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V8.csv', index=False, header=True)

# ----misclassification dataframe: rows whose prediction differs from the true label
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V8.csv', index=False, header=True)

# ----print the distribution of the prediction (one row per predicted label: label, count)
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
# Build the confusion matrix once. Explicit labels keep it 2x2 (rows = true label,
# columns = predicted label) even if only one label value occurs.
cm = metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1])
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0))
print("Confusion Matrix:\n", cm)

# Class 0 (dark pattern) is treated as the positive class.
true_pos, false_neg, false_pos = cm[0][0], cm[0][1], cm[1][0]
predicted_pos = true_pos + false_pos
actual_pos = true_pos + false_neg
# Report 0.0 instead of dividing by zero when a denominator is empty.
precision = true_pos / predicted_pos if predicted_pos else 0.0
recall = true_pos / actual_pos if actual_pos else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  430]
 [   1 3264]]

Accuracy: 0.9740119112073633
Precision: 0.7953488372093023
Confusion Matrix:
 [[ 342    8]
 [  88 3256]]

Precison: 0.795
Recall: 0.977
F1 Score: 0.877


### Version 9 -- CV -- CountVectorizer
------

In [20]:
# Evaluate the saved Version 9 (CV, CountVectorizer) vectorizer + classifier on
# the test frame `data`: predict with a custom probability threshold, export the
# rows predicted as dark patterns and the misclassified rows, then print metrics.

# -----Text Vectorizer Loading
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifacts from a trusted source.
cv = joblib.load("V9/CV/CountVectorizer/presence_CountVectorizer.joblib")

# -----Model Loading
clf = joblib.load("V9/CV/CountVectorizer/bnb_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# A row is labelled 1 when column 1 of predict_proba is at least `threshold`,
# otherwise 0 (this replaces the default clf.predict decision rule).
# NOTE(review): column 1 is assumed to be the probability of class 1, i.e. clf.classes_ == [0, 1] — confirm.
threshold = 0.4
pred_vec = (clf.predict_proba(cv.transform(data['content']))[:, 1] >= threshold).astype(int)

data['prediction'] = pred_vec.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V9-CV-CountVectorizer.csv', index=False, header=True)

# ----misclassification dataframe: rows whose prediction differs from the true label
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V9-CV-CountVectorizer.csv', index=False, header=True)

# ----print the distribution of the prediction (one row per predicted label: label, count)
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
# Build the confusion matrix once. Explicit labels keep it 2x2 (rows = true label,
# columns = predicted label) even if only one label value occurs.
cm = metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1])
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0))
print("Confusion Matrix:\n", cm)

# Class 0 (dark pattern) is treated as the positive class.
true_pos, false_neg, false_pos = cm[0][0], cm[0][1], cm[1][0]
predicted_pos = true_pos + false_pos
actual_pos = true_pos + false_neg
# Report 0.0 instead of dividing by zero when a denominator is empty.
precision = true_pos / predicted_pos if predicted_pos else 0.0
recall = true_pos / actual_pos if actual_pos else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  371]
 [   1 3323]]

Accuracy: 0.9883595018949648
Precision: 0.9137466307277629
Confusion Matrix:
 [[ 339   11]
 [  32 3312]]

Precison: 0.914
Recall: 0.969
F1 Score: 0.940


### Version 9 -- CV -- TfidfVectorizer
------

In [21]:
# Evaluate the saved Version 9 (CV, TfidfVectorizer) vectorizer + classifier on
# the test frame `data`: predict with a custom probability threshold, export the
# rows predicted as dark patterns and the misclassified rows, then print metrics.

# -----Text Vectorizer Loading
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifacts from a trusted source.
cv = joblib.load("V9/CV/TfidfVectorizer/presence_TfidfVectorizer.joblib")

# -----Model Loading
clf = joblib.load("V9/CV/TfidfVectorizer/bnb_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# A row is labelled 1 when column 1 of predict_proba is at least `threshold`,
# otherwise 0 (this replaces the default clf.predict decision rule).
# NOTE(review): column 1 is assumed to be the probability of class 1, i.e. clf.classes_ == [0, 1] — confirm.
threshold = 0.4
pred_vec = (clf.predict_proba(cv.transform(data['content']))[:, 1] >= threshold).astype(int)

data['prediction'] = pred_vec.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V9-CV-TfidfVectorizer.csv', index=False, header=True)

# ----misclassification dataframe: rows whose prediction differs from the true label
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V9-CV-TfidfVectorizer.csv', index=False, header=True)

# ----print the distribution of the prediction (one row per predicted label: label, count)
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
# Build the confusion matrix once. Explicit labels keep it 2x2 (rows = true label,
# columns = predicted label) even if only one label value occurs.
cm = metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1])
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0))
print("Confusion Matrix:\n", cm)

# Class 0 (dark pattern) is treated as the positive class.
true_pos, false_neg, false_pos = cm[0][0], cm[0][1], cm[1][0]
predicted_pos = true_pos + false_pos
actual_pos = true_pos + false_neg
# Report 0.0 instead of dividing by zero when a denominator is empty.
precision = true_pos / predicted_pos if predicted_pos else 0.0
recall = true_pos / actual_pos if actual_pos else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  371]
 [   1 3323]]

Accuracy: 0.9883595018949648
Precision: 0.9137466307277629
Confusion Matrix:
 [[ 339   11]
 [  32 3312]]

Precison: 0.914
Recall: 0.969
F1 Score: 0.940


### Version 9 -- HO -- CountVectorizer
------

In [22]:
# Evaluate the saved Version 9 (HO, CountVectorizer) vectorizer + classifier on
# the test frame `data`: predict with a custom probability threshold, export the
# rows predicted as dark patterns and the misclassified rows, then print metrics.

# -----Text Vectorizer Loading
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifacts from a trusted source.
cv = joblib.load("V9/HO/CountVectorizer/presence_CountVectorizer.joblib")

# -----Model Loading
clf = joblib.load("V9/HO/CountVectorizer/bnb_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# A row is labelled 1 when column 1 of predict_proba is at least `threshold`,
# otherwise 0 (this replaces the default clf.predict decision rule).
# NOTE(review): column 1 is assumed to be the probability of class 1, i.e. clf.classes_ == [0, 1] — confirm.
threshold = 0.4
pred_vec = (clf.predict_proba(cv.transform(data['content']))[:, 1] >= threshold).astype(int)

data['prediction'] = pred_vec.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V9-HO-CountVectorizer.csv', index=False, header=True)

# ----misclassification dataframe: rows whose prediction differs from the true label
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V9-HO-CountVectorizer.csv', index=False, header=True)

# ----print the distribution of the prediction (one row per predicted label: label, count)
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
# Build the confusion matrix once. Explicit labels keep it 2x2 (rows = true label,
# columns = predicted label) even if only one label value occurs.
cm = metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1])
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0))
print("Confusion Matrix:\n", cm)

# Class 0 (dark pattern) is treated as the positive class.
true_pos, false_neg, false_pos = cm[0][0], cm[0][1], cm[1][0]
predicted_pos = true_pos + false_pos
actual_pos = true_pos + false_neg
# Report 0.0 instead of dividing by zero when a denominator is empty.
precision = true_pos / predicted_pos if predicted_pos else 0.0
recall = true_pos / actual_pos if actual_pos else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  397]
 [   1 3297]]

Accuracy: 0.9834867352463454
Precision: 0.8639798488664987
Confusion Matrix:
 [[ 343    7]
 [  54 3290]]

Precison: 0.864
Recall: 0.980
F1 Score: 0.918


### Version 9 -- HO -- TfidfVectorizer
------

In [23]:
# Evaluate the saved Version 9 (HO, TfidfVectorizer) vectorizer + classifier on
# the test frame `data`: predict with a custom probability threshold, export the
# rows predicted as dark patterns and the misclassified rows, then print metrics.

# -----Text Vectorizer Loading
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifacts from a trusted source.
cv = joblib.load("V9/HO/TfidfVectorizer/presence_TfidfVectorizer.joblib")

# -----Model Loading
clf = joblib.load("V9/HO/TfidfVectorizer/bnb_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# A row is labelled 1 when column 1 of predict_proba is at least `threshold`,
# otherwise 0 (this replaces the default clf.predict decision rule).
# NOTE(review): column 1 is assumed to be the probability of class 1, i.e. clf.classes_ == [0, 1] — confirm.
threshold = 0.4
pred_vec = (clf.predict_proba(cv.transform(data['content']))[:, 1] >= threshold).astype(int)

data['prediction'] = pred_vec.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V9-HO-TfidfVectorizer.csv', index=False, header=True)

# ----misclassification dataframe: rows whose prediction differs from the true label
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V9-HO-TfidfVectorizer.csv', index=False, header=True)

# ----print the distribution of the prediction (one row per predicted label: label, count)
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
# Build the confusion matrix once. Explicit labels keep it 2x2 (rows = true label,
# columns = predicted label) even if only one label value occurs.
cm = metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1])
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0))
print("Confusion Matrix:\n", cm)

# Class 0 (dark pattern) is treated as the positive class.
true_pos, false_neg, false_pos = cm[0][0], cm[0][1], cm[1][0]
predicted_pos = true_pos + false_pos
actual_pos = true_pos + false_neg
# Report 0.0 instead of dividing by zero when a denominator is empty.
precision = true_pos / predicted_pos if predicted_pos else 0.0
recall = true_pos / actual_pos if actual_pos else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  371]
 [   1 3323]]

Accuracy: 0.986735246345425
Precision: 0.9056603773584906
Confusion Matrix:
 [[ 336   14]
 [  35 3309]]

Precison: 0.906
Recall: 0.960
F1 Score: 0.932


### Version 10 -- CV
------

In [24]:
# Evaluate the saved Version 10 (CV) vectorizer + classifier on the test frame
# `data`: predict with a custom probability threshold, export the rows predicted
# as dark patterns and the misclassified rows, then print summary metrics.

# -----Text Vectorizer Loading
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifacts from a trusted source.
cv = joblib.load("V10/CV/presence_TfidfVectorizer.joblib")

# -----Model Loading
clf = joblib.load("V10/CV/bnb_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# A row is labelled 1 when column 1 of predict_proba is at least `threshold`,
# otherwise 0 (this replaces the default clf.predict decision rule).
# NOTE(review): column 1 is assumed to be the probability of class 1, i.e. clf.classes_ == [0, 1] — confirm.
threshold = 0.4
pred_vec = (clf.predict_proba(cv.transform(data['content']))[:, 1] >= threshold).astype(int)

data['prediction'] = pred_vec.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V10-CV.csv', index=False, header=True)

# ----misclassification dataframe: rows whose prediction differs from the true label
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V10-CV.csv', index=False, header=True)

# ----print the distribution of the prediction (one row per predicted label: label, count)
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
# Build the confusion matrix once. Explicit labels keep it 2x2 (rows = true label,
# columns = predicted label) even if only one label value occurs.
cm = metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1])
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0))
print("Confusion Matrix:\n", cm)

# Class 0 (dark pattern) is treated as the positive class.
true_pos, false_neg, false_pos = cm[0][0], cm[0][1], cm[1][0]
predicted_pos = true_pos + false_pos
actual_pos = true_pos + false_neg
# Report 0.0 instead of dividing by zero when a denominator is empty.
precision = true_pos / predicted_pos if predicted_pos else 0.0
recall = true_pos / actual_pos if actual_pos else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  362]
 [   1 3332]]

Accuracy: 0.981591770438549
Precision: 0.8895027624309392
Confusion Matrix:
 [[ 322   28]
 [  40 3304]]

Precison: 0.890
Recall: 0.920
F1 Score: 0.904


### Version 10 -- HO
------

In [25]:
# Evaluate the Version 10 (HO) Bernoulli Naive Bayes model on the testing dataset.
# Reads : `data` (columns 'content' and 'Classification').
# Writes: the 'prediction' column of `data`, two CSV reports, and printed metrics.

# -----Text Vectorizer Loading
# NOTE(review): despite the short name, `cv` is loaded from a file named
# "presence_TfidfVectorizer" -- presumably a fitted TfidfVectorizer; TODO confirm.
cv = joblib.load("V10/HO/presence_TfidfVectorizer.joblib")

# -----Model Loading
clf = joblib.load("V10/HO/bnb_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))

# -----Make predictions on the testing dataset
# A custom decision threshold is applied to the predicted probabilities instead of
# calling clf.predict(): a row is labelled 1 when the probability in column 1 is at
# least `threshold`, otherwise it is labelled 0.
# NOTE(review): column 1 of predict_proba is assumed to be class 1
# (i.e. clf.classes_ == [0, 1]) -- TODO confirm.
threshold = 0.4
proba_class_1 = clf.predict_proba(cv.transform(data['content']))[:, 1]
pred_vec = (proba_class_1 >= threshold).astype(int)

data['prediction'] = pred_vec.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V10-HO.csv', index=False, header=True)

# ----misclassification dataframe: rows whose prediction differs from the true label
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V10-HO.csv', index=False, header=True)

# ----print the distribution of the prediction (one row per label: [label, count])
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
y_true = data['Classification']

# Build the confusion matrix once. labels=[0, 1] pins a 2x2 layout
# (rows = true label, columns = predicted label) even if one class is missing.
cm = metrics.confusion_matrix(y_true, pred_vec, labels=[0, 1])

# Class 0 (dark pattern) is the class of interest, hence pos_label=0.
# Using the library scorers instead of indexing into `cm` by hand avoids a silent
# nan when class 0 is never predicted (scikit-learn reports 0.0 with a warning).
precision = metrics.precision_score(y_true, pred_vec, pos_label=0)
recall = metrics.recall_score(y_true, pred_vec, pos_label=0)
f1 = metrics.f1_score(y_true, pred_vec, pos_label=0)

print("Accuracy:", metrics.accuracy_score(y_true, pred_vec))
print("Precision:", precision)
print("Confusion Matrix:\n", cm)

# NOTE: the "Precison" label spelling is kept unchanged so that the printed text
# stays identical to previously recorded runs.
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  346]
 [   1 3348]]

Accuracy: 0.9463995668651868
Precision: 0.7196531791907514
Confusion Matrix:
 [[ 249  101]
 [  97 3247]]

Precison: 0.720
Recall: 0.711
F1 Score: 0.716
