# Type Classifier Evaluation

This notebook evaluates the pretrained Type Classifier models.

------


In [1]:
import pandas as pd 
import numpy as np

from sklearn.preprocessing import LabelEncoder

# provides a simple way to both tokenize a collection of text documents and build a vocabulary of known words, but also to encode new documents using that vocabulary.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Bernoulli Naive Bayes (similar to MultinomialNB) is a classifier suitable for discrete data. The difference between MultinomialNB and BernoulliNB is that while MultinomialNB works with occurrence counts, BernoulliNB is designed for binary/boolean features, which means that in the case of text classification, word occurrence vectors (rather than word count vectors) may be more suitable for training and using this classifier.
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

# Evaluation metrics
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score

# joblib is a set of tools to provide lightweight pipelining in Python. It provides utilities for saving and loading Python objects that make use of NumPy data structures, efficiently.
import joblib

import matplotlib.pyplot as plt
# import seaborn as sns

------

# Using Single Line Input for Testing

------
## Using CountVectorizer

In [2]:
# ----- Load the text vectorizer used by the type classifiers
# NOTE: joblib.load unpickles arbitrary objects; only load artifacts from a trusted source.
cv = joblib.load("CV/type_CountVectorizer.joblib")

# ----- Load the four pretrained type classifiers (LR, RF, SVM, Multinomial NB)
lr_clf, rf_clf, svm_clf, mnb_clf = (
    joblib.load(model_path)
    for model_path in (
        "CV/lr_type_classifier.joblib",
        "CV/rf_type_classifier.joblib",
        "CV/svm_type_classifier.joblib",
        "CV/mnb_type_classifier.joblib",
    )
)

In [15]:
# ----- Sample single-line inputs used by the prediction cells below
(text1, text2, text3, text4, text5, text6) = (
    '30 items has been sold',
    '30 items has been sold in the last 3 hours',
    'Item is selling fast, order now!',
    'Five other people are viewing this item.',
    '22 sold',
    '15 sold',
)

In [6]:
# Lookup table: encoded class index -> dark pattern type name.
cat_dic = dict(enumerate([
    'Fake Activity',
    'Fake Countdown',
    'Fake High-demand',
    'Fake Limited-time',
    'Fake Low-stock',
]))

In [11]:
# ----- Classify a single text line with every pretrained type model
# === Text1:
# Vectorize the sample once and reuse the features for all four models.
text1_features = cv.transform(pd.Series(text1))

pred_lr = lr_clf.predict(text1_features)
pred_rf = rf_clf.predict(text1_features)
pred_svm = svm_clf.predict(text1_features)
pred_mnb = mnb_clf.predict(text1_features)

# predict() returns a length-1 array for a single sample; index it explicitly,
# because calling int() on an array with ndim > 0 is deprecated since NumPy 1.25.
print('Prediction from Logistic Regression Model:\n\b', cat_dic[int(pred_lr[0])])
print('Prediction from Random Forest Model:\n\b', cat_dic[int(pred_rf[0])])
print('Prediction from SVM Model:\n\b', cat_dic[int(pred_svm[0])])
print('Prediction from Multinomial Naive Bayes Model:\n\b', cat_dic[int(pred_mnb[0])])

Prediction from Logistic Regression Model:
 Fake Activity
Prediction from Random Forest Model:
 Fake Activity
Prediction from SVM Model:
 Fake Activity
Prediction from Multinomial Naive Bayes Model:
 Fake Countdown


In [12]:
# ----- Classify a single text line with every pretrained type model
# === Text2:
# Vectorize the sample once and reuse the features for all four models.
text2_features = cv.transform(pd.Series(text2))

pred_lr = lr_clf.predict(text2_features)
pred_rf = rf_clf.predict(text2_features)
pred_svm = svm_clf.predict(text2_features)
pred_mnb = mnb_clf.predict(text2_features)

# predict() returns a length-1 array for a single sample; index it explicitly,
# because calling int() on an array with ndim > 0 is deprecated since NumPy 1.25.
print('Prediction from Logistic Regression Model:\n\b', cat_dic[int(pred_lr[0])])
print('Prediction from Random Forest Model:\n\b', cat_dic[int(pred_rf[0])])
print('Prediction from SVM Model:\n\b', cat_dic[int(pred_svm[0])])
print('Prediction from Multinomial Naive Bayes Model:\n\b', cat_dic[int(pred_mnb[0])])

Prediction from Logistic Regression Model:
 Fake Activity
Prediction from Random Forest Model:
 Fake Activity
Prediction from SVM Model:
 Fake Activity
Prediction from Multinomial Naive Bayes Model:
 Fake Countdown


In [13]:
# ----- Classify a single text line with every pretrained type model
# === Text3:
# Vectorize the sample once and reuse the features for all four models.
text3_features = cv.transform(pd.Series(text3))

pred_lr = lr_clf.predict(text3_features)
pred_rf = rf_clf.predict(text3_features)
pred_svm = svm_clf.predict(text3_features)
pred_mnb = mnb_clf.predict(text3_features)

# predict() returns a length-1 array for a single sample; index it explicitly,
# because calling int() on an array with ndim > 0 is deprecated since NumPy 1.25.
print('Prediction from Logistic Regression Model:\n\b', cat_dic[int(pred_lr[0])])
print('Prediction from Random Forest Model:\n\b', cat_dic[int(pred_rf[0])])
print('Prediction from SVM Model:\n\b', cat_dic[int(pred_svm[0])])
print('Prediction from Multinomial Naive Bayes Model:\n\b', cat_dic[int(pred_mnb[0])])

Prediction from Logistic Regression Model:
 Fake High-demand
Prediction from Random Forest Model:
 Fake High-demand
Prediction from SVM Model:
 Fake High-demand
Prediction from Multinomial Naive Bayes Model:
 Fake High-demand


In [14]:
# ----- Classify a single text line with every pretrained type model
# === Text4:
# Vectorize the sample once and reuse the features for all four models.
text4_features = cv.transform(pd.Series(text4))

pred_lr = lr_clf.predict(text4_features)
pred_rf = rf_clf.predict(text4_features)
pred_svm = svm_clf.predict(text4_features)
pred_mnb = mnb_clf.predict(text4_features)

# predict() returns a length-1 array for a single sample; index it explicitly,
# because calling int() on an array with ndim > 0 is deprecated since NumPy 1.25.
print('Prediction from Logistic Regression Model:\n\b', cat_dic[int(pred_lr[0])])
print('Prediction from Random Forest Model:\n\b', cat_dic[int(pred_rf[0])])
print('Prediction from SVM Model:\n\b', cat_dic[int(pred_svm[0])])
print('Prediction from Multinomial Naive Bayes Model:\n\b', cat_dic[int(pred_mnb[0])])

Prediction from Logistic Regression Model:
 Fake Activity
Prediction from Random Forest Model:
 Fake Activity
Prediction from SVM Model:
 Fake Activity
Prediction from Multinomial Naive Bayes Model:
 Fake Activity


In [16]:
# ----- Classify a single text line with every pretrained type model
# === Text5:
# Vectorize the sample once and reuse the features for all four models.
text5_features = cv.transform(pd.Series(text5))

pred_lr = lr_clf.predict(text5_features)
pred_rf = rf_clf.predict(text5_features)
pred_svm = svm_clf.predict(text5_features)
pred_mnb = mnb_clf.predict(text5_features)

# predict() returns a length-1 array for a single sample; index it explicitly,
# because calling int() on an array with ndim > 0 is deprecated since NumPy 1.25.
print('Prediction from Logistic Regression Model:\n\b', cat_dic[int(pred_lr[0])])
print('Prediction from Random Forest Model:\n\b', cat_dic[int(pred_rf[0])])
print('Prediction from SVM Model:\n\b', cat_dic[int(pred_svm[0])])
print('Prediction from Multinomial Naive Bayes Model:\n\b', cat_dic[int(pred_mnb[0])])

Prediction from Logistic Regression Model:
 Fake Activity
Prediction from Random Forest Model:
 Fake Activity
Prediction from SVM Model:
 Fake Activity
Prediction from Multinomial Naive Bayes Model:
 Fake Activity


In [17]:
# ----- Classify a single text line with every pretrained type model
# === Text6:
# Vectorize the sample once and reuse the features for all four models.
text6_features = cv.transform(pd.Series(text6))

pred_lr = lr_clf.predict(text6_features)
pred_rf = rf_clf.predict(text6_features)
pred_svm = svm_clf.predict(text6_features)
pred_mnb = mnb_clf.predict(text6_features)

# predict() returns a length-1 array for a single sample; index it explicitly,
# because calling int() on an array with ndim > 0 is deprecated since NumPy 1.25.
print('Prediction from Logistic Regression Model:\n\b', cat_dic[int(pred_lr[0])])
print('Prediction from Random Forest Model:\n\b', cat_dic[int(pred_rf[0])])
print('Prediction from SVM Model:\n\b', cat_dic[int(pred_svm[0])])
print('Prediction from Multinomial Naive Bayes Model:\n\b', cat_dic[int(pred_mnb[0])])

Prediction from Logistic Regression Model:
 Fake Activity
Prediction from Random Forest Model:
 Fake Activity
Prediction from SVM Model:
 Fake Activity
Prediction from Multinomial Naive Bayes Model:
 Fake Activity


## Using Testing Dataset (TODO: to be continued)

---
Import the testing dataset

In [2]:
# Read the labelled testing dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='test_data.csv')

In [3]:
# Preview the first five rows of the testing dataset.
data.head(n=5)

Unnamed: 0,content,Classification
0,Your price for this item is $ 89,1
1,Your price for this item is $ 79,1
2,Your price for this item is $ 55,1
3,Your price for this item is $ 49,1
4,Your price for this item is $ 21,1


---
`check the dataset information`

There are 3694 NOT NULL instances of content strings in the dataset.

In [4]:
# Summarise the frame: index range, column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3694 entries, 0 to 3693
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   content         3694 non-null   object
 1   Classification  3694 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 57.8+ KB


In [5]:
# Class balance of the ground-truth labels stored in 'Classification'.
tag_counts = data['Classification'].value_counts()
print('Distribution of the tags:\n{}'.format(tag_counts))

Distribution of the tags:
1    3344
0     350
Name: Classification, dtype: int64


In [6]:
# Drop rows whose 'content' repeats an earlier row (the first occurrence is kept),
# then re-inspect the frame to see how many rows remain.
data = data.drop_duplicates(subset=["content"])
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3694 entries, 0 to 3693
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   content         3694 non-null   object
 1   Classification  3694 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 86.6+ KB


In [7]:
# Show the first rows followed by the label balance after de-duplication.
preview_rows = data.head(5)
print(preview_rows)

label_counts = data['Classification'].value_counts()
print('\nDistribution of the tags:\n{}'.format(label_counts))

                            content  Classification
0  Your price for this item is $ 89               1
1  Your price for this item is $ 79               1
2  Your price for this item is $ 55               1
3  Your price for this item is $ 49               1
4  Your price for this item is $ 21               1

Distribution of the tags:
1    3344
0     350
Name: Classification, dtype: int64


---
## Data Preparation

---
`Encode the target values into integers` --- 'Classification'

In [8]:
# X: raw text inputs, Y: ground-truth labels.
X, Y = data['content'], data['Classification']

In [9]:
# Fit the label encoder on the ground-truth labels (fit returns the encoder itself),
# then map every label to its integer code.
encoder = LabelEncoder().fit(Y)
y = encoder.transform(Y)

In [10]:
# Show which integer code each original label received (label -> code).
# Per the original note, codes 0 and 1 stand for 'Dark' and 'Not Dark'.
integer_mapping = dict(zip(encoder.classes_, range(len(encoder.classes_))))
print(integer_mapping)

{0: 0, 1: 1}


In [11]:
# Frequency table of the original (un-encoded) labels: one row per label, [label, count].
unique, counts = np.unique(Y, return_counts=True)
frequencies = np.column_stack((unique, counts))

print(frequencies)

[[   0  350]
 [   1 3344]]


In [12]:
# Frequency table of the encoded labels: one row per integer code, [code, count].
unique, counts = np.unique(y, return_counts=True)
frequencies = np.column_stack((unique, counts))

print(frequencies)

[[   0  350]
 [   1 3344]]


---
# Bernoulli Naive Bayes Classifier


### Version 1
------

In [13]:
# ----- Load the text vectorizer and the pretrained presence classifier (Version 1)
# NOTE: joblib.load unpickles arbitrary objects; only load artifacts from a trusted source.
cv = joblib.load("V1/presence_CountVectorizer.joblib")
clf = joblib.load("V1/bnb_presence_classifier.joblib")

# ----- Print the parameters reported by the loaded estimator
# NOTE(review): the recorded output suggests clf is a grid-search wrapper, so this shows the
# search configuration; clf.best_params_ would show the selected values -- confirm.
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))

# ----- Predict every row of the testing dataset with the default decision rule
# (an alternative 0.8 probability cut-off via predict_proba is not applied here).
pred_vec = clf.predict(cv.transform(data['content']))
data['prediction'] = pred_vec.tolist()

# ----- Export the rows predicted as dark pattern (prediction == 0)
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V1.csv', index=False, header=True)

# ----- Export the rows where prediction and ground truth disagree
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V1.csv', index=False, header=True)

# ----- Distribution of the predicted labels
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----- Overview of the prediction results
# Build the confusion matrix once, with the label order pinned so row/column 0 is
# always class 0 (dark pattern), and reuse it for the printout and the metrics.
cm = metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1])
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0))
print("Confusion Matrix:\n", cm)

# Rows are true labels, columns are predictions; class 0 is the positive class.
tp, fn = cm[0]
fp = cm[1][0]
# Guard the ratios so an empty denominator yields 0 instead of NaN plus a RuntimeWarning.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  553]
 [   1 3141]]

Accuracy: 0.941797509474824
Precision: 0.6220614828209765
Confusion Matrix:
 [[ 344    6]
 [ 209 3135]]

Precison: 0.622
Recall: 0.983
F1 Score: 0.762


### Version 2
------

In [14]:
# ----- Load the text vectorizer and the pretrained presence classifier (Version 2)
# NOTE: joblib.load unpickles arbitrary objects; only load artifacts from a trusted source.
cv = joblib.load("V2/presence_CountVectorizer.joblib")
clf = joblib.load("V2/bnb_presence_classifier.joblib")

# ----- Print the parameters reported by the loaded estimator
# NOTE(review): the recorded output suggests clf is a grid-search wrapper, so this shows the
# search configuration; clf.best_params_ would show the selected values -- confirm.
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))

# ----- Predict every row of the testing dataset with the default decision rule
# (an alternative 0.8 probability cut-off via predict_proba is not applied here).
pred_vec = clf.predict(cv.transform(data['content']))
data['prediction'] = pred_vec.tolist()

# ----- Export the rows predicted as dark pattern (prediction == 0)
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V2.csv', index=False, header=True)

# ----- Export the rows where prediction and ground truth disagree
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V2.csv', index=False, header=True)

# ----- Distribution of the predicted labels
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----- Overview of the prediction results
# Build the confusion matrix once, with the label order pinned so row/column 0 is
# always class 0 (dark pattern), and reuse it for the printout and the metrics.
cm = metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1])
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0))
print("Confusion Matrix:\n", cm)

# Rows are true labels, columns are predictions; class 0 is the positive class.
tp, fn = cm[0]
fp = cm[1][0]
# Guard the ratios so an empty denominator yields 0 instead of NaN plus a RuntimeWarning.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  855]
 [   1 2839]]

Accuracy: 0.8622089875473741
Precision: 0.4070175438596491
Confusion Matrix:
 [[ 348    2]
 [ 507 2837]]

Precison: 0.407
Recall: 0.994
F1 Score: 0.578


### Version 4
------

In [15]:
# ----- Load the text vectorizer and the pretrained presence classifier (Version 4)
# NOTE: joblib.load unpickles arbitrary objects; only load artifacts from a trusted source.
cv = joblib.load("V4/presence_CountVectorizer.joblib")
clf = joblib.load("V4/bnb_presence_classifier.joblib")

# ----- Print the parameters reported by the loaded estimator
# NOTE(review): the recorded output suggests clf is a grid-search wrapper, so this shows the
# search configuration; clf.best_params_ would show the selected values -- confirm.
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))

# ----- Predict every row of the testing dataset with the default decision rule
# (an alternative 0.8 probability cut-off via predict_proba is not applied here).
pred_vec = clf.predict(cv.transform(data['content']))
data['prediction'] = pred_vec.tolist()

# ----- Export the rows predicted as dark pattern (prediction == 0)
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V4.csv', index=False, header=True)

# ----- Export the rows where prediction and ground truth disagree
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V4.csv', index=False, header=True)

# ----- Distribution of the predicted labels
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----- Overview of the prediction results
# Build the confusion matrix once, with the label order pinned so row/column 0 is
# always class 0 (dark pattern), and reuse it for the printout and the metrics.
cm = metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1])
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0))
print("Confusion Matrix:\n", cm)

# Rows are true labels, columns are predictions; class 0 is the positive class.
tp, fn = cm[0]
fp = cm[1][0]
# Guard the ratios so an empty denominator yields 0 instead of NaN plus a RuntimeWarning.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  490]
 [   1 3204]]

Accuracy: 0.9593936112615051
Precision: 0.7040816326530612
Confusion Matrix:
 [[ 345    5]
 [ 145 3199]]

Precison: 0.704
Recall: 0.986
F1 Score: 0.821


### Version 5
------

In [16]:
# ----- Load the text vectorizer and the pretrained presence classifier (Version 5)
# NOTE: joblib.load unpickles arbitrary objects; only load artifacts from a trusted source.
cv = joblib.load("V5/presence_CountVectorizer.joblib")
clf = joblib.load("V5/bnb_presence_classifier.joblib")

# ----- Print the parameters reported by the loaded estimator
# NOTE(review): the recorded output suggests clf is a grid-search wrapper, so this shows the
# search configuration; clf.best_params_ would show the selected values -- confirm.
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))

# ----- Predict every row of the testing dataset with the default decision rule
# (an alternative 0.8 probability cut-off via predict_proba is not applied here).
pred_vec = clf.predict(cv.transform(data['content']))
data['prediction'] = pred_vec.tolist()

# ----- Export the rows predicted as dark pattern (prediction == 0)
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V5.csv', index=False, header=True)

# ----- Export the rows where prediction and ground truth disagree
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V5.csv', index=False, header=True)

# ----- Distribution of the predicted labels
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----- Overview of the prediction results
# Build the confusion matrix once, with the label order pinned so row/column 0 is
# always class 0 (dark pattern), and reuse it for the printout and the metrics.
cm = metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1])
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0))
print("Confusion Matrix:\n", cm)

# Rows are true labels, columns are predictions; class 0 is the positive class.
tp, fn = cm[0]
fp = cm[1][0]
# Guard the ratios so an empty denominator yields 0 instead of NaN plus a RuntimeWarning.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  492]
 [   1 3202]]

Accuracy: 0.9593936112615051
Precision: 0.7032520325203252
Confusion Matrix:
 [[ 346    4]
 [ 146 3198]]

Precison: 0.703
Recall: 0.989
F1 Score: 0.822


### Version 6
------

In [17]:
# ----- Load the text vectorizer and the pretrained presence classifier (Version 6)
# NOTE: joblib.load unpickles arbitrary objects; only load artifacts from a trusted source.
cv = joblib.load("V6/presence_CountVectorizer.joblib")
clf = joblib.load("V6/bnb_presence_classifier.joblib")

# ----- Print the parameters reported by the loaded estimator
# NOTE(review): the recorded output suggests clf is a grid-search wrapper, so this shows the
# search configuration; clf.best_params_ would show the selected values -- confirm.
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))

# ----- Predict every row of the testing dataset with the default decision rule
# (an alternative 0.8 probability cut-off via predict_proba is not applied here).
pred_vec = clf.predict(cv.transform(data['content']))
data['prediction'] = pred_vec.tolist()

# ----- Export the rows predicted as dark pattern (prediction == 0)
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V6.csv', index=False, header=True)

# ----- Export the rows where prediction and ground truth disagree
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V6.csv', index=False, header=True)

# ----- Distribution of the predicted labels
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----- Overview of the prediction results
# Build the confusion matrix once, with the label order pinned so row/column 0 is
# always class 0 (dark pattern), and reuse it for the printout and the metrics.
cm = metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1])
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0))
print("Confusion Matrix:\n", cm)

# Rows are true labels, columns are predictions; class 0 is the positive class.
tp, fn = cm[0]
fp = cm[1][0]
# Guard the ratios so an empty denominator yields 0 instead of NaN plus a RuntimeWarning.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  527]
 [   1 3167]]

Accuracy: 0.9510016242555496
Precision: 0.6603415559772297
Confusion Matrix:
 [[ 348    2]
 [ 179 3165]]

Precison: 0.660
Recall: 0.994
F1 Score: 0.794


### Version 7
------

In [18]:
# ----- Load the text vectorizer and the pretrained presence classifier (Version 7)
# NOTE: joblib.load unpickles arbitrary objects; only load artifacts from a trusted source.
cv = joblib.load("V7/presence_CountVectorizer.joblib")
clf = joblib.load("V7/bnb_presence_classifier.joblib")

# ----- Print the parameters reported by the loaded estimator
# NOTE(review): the recorded output suggests clf is a grid-search wrapper, so this shows the
# search configuration; clf.best_params_ would show the selected values -- confirm.
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))

# ----- Predict every row of the testing dataset with the default decision rule
# (an alternative 0.8 probability cut-off via predict_proba is not applied here).
pred_vec = clf.predict(cv.transform(data['content']))
data['prediction'] = pred_vec.tolist()

# ----- Export the rows predicted as dark pattern (prediction == 0)
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V7.csv', index=False, header=True)

# ----- Export the rows where prediction and ground truth disagree
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V7.csv', index=False, header=True)

# ----- Distribution of the predicted labels
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----- Overview of the prediction results
# Build the confusion matrix once, with the label order pinned so row/column 0 is
# always class 0 (dark pattern), and reuse it for the printout and the metrics.
cm = metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1])
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0))
print("Confusion Matrix:\n", cm)

# Rows are true labels, columns are predictions; class 0 is the positive class.
tp, fn = cm[0]
fp = cm[1][0]
# Guard the ratios so an empty denominator yields 0 instead of NaN plus a RuntimeWarning.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  509]
 [   1 3185]]

Accuracy: 0.9547915538711423
Precision: 0.6797642436149313
Confusion Matrix:
 [[ 346    4]
 [ 163 3181]]

Precison: 0.680
Recall: 0.989
F1 Score: 0.806


### Version 8
------

In [19]:
# ----- Load the text vectorizer and the pretrained presence classifier (Version 8)
# NOTE: joblib.load unpickles arbitrary objects; only load artifacts from a trusted source.
cv = joblib.load("V8/presence_CountVectorizer.joblib")
clf = joblib.load("V8/bnb_presence_classifier.joblib")

# ----- Print the parameters reported by the loaded estimator
# NOTE(review): the recorded output suggests clf is a grid-search wrapper, so this shows the
# search configuration; clf.best_params_ would show the selected values -- confirm.
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))

# ----- Predict every row of the testing dataset with the default decision rule
# (an alternative 0.8 probability cut-off via predict_proba is not applied here).
pred_vec = clf.predict(cv.transform(data['content']))
data['prediction'] = pred_vec.tolist()

# ----- Export the rows predicted as dark pattern (prediction == 0)
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V8.csv', index=False, header=True)

# ----- Export the rows where prediction and ground truth disagree
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V8.csv', index=False, header=True)

# ----- Distribution of the predicted labels
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----- Overview of the prediction results
# Build the confusion matrix once, with the label order pinned so row/column 0 is
# always class 0 (dark pattern), and reuse it for the printout and the metrics.
cm = metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1])
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0))
print("Confusion Matrix:\n", cm)

# Rows are true labels, columns are predictions; class 0 is the positive class.
tp, fn = cm[0]
fp = cm[1][0]
# Guard the ratios so an empty denominator yields 0 instead of NaN plus a RuntimeWarning.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  458]
 [   1 3236]]

Accuracy: 0.9685977260422306
Precision: 0.7554585152838428
Confusion Matrix:
 [[ 346    4]
 [ 112 3232]]

Precison: 0.755
Recall: 0.989
F1 Score: 0.856


### Version 9 -- CV -- CountVectorizer
------

In [20]:
# ----- Load the text vectorizer and the pretrained presence classifier (Version 9, CV, CountVectorizer)
# NOTE: joblib.load unpickles arbitrary objects; only load artifacts from a trusted source.
cv = joblib.load("V9/CV/CountVectorizer/presence_CountVectorizer.joblib")
clf = joblib.load("V9/CV/CountVectorizer/bnb_presence_classifier.joblib")

# ----- Print the parameters reported by the loaded estimator
# NOTE(review): the recorded output suggests clf is a grid-search wrapper, so this shows the
# search configuration; clf.best_params_ would show the selected values -- confirm.
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))

# ----- Predict every row of the testing dataset with the default decision rule
# (an alternative 0.8 probability cut-off via predict_proba is not applied here).
pred_vec = clf.predict(cv.transform(data['content']))
data['prediction'] = pred_vec.tolist()

# ----- Export the rows predicted as dark pattern (prediction == 0)
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V9-CV-CountVectorizer.csv', index=False, header=True)

# ----- Export the rows where prediction and ground truth disagree
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V9-CV-CountVectorizer.csv', index=False, header=True)

# ----- Distribution of the predicted labels
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----- Overview of the prediction results
# Build the confusion matrix once, with the label order pinned so row/column 0 is
# always class 0 (dark pattern), and reuse it for the printout and the metrics.
cm = metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1])
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0))
print("Confusion Matrix:\n", cm)

# Rows are true labels, columns are predictions; class 0 is the positive class.
tp, fn = cm[0]
fp = cm[1][0]
# Guard the ratios so an empty denominator yields 0 instead of NaN plus a RuntimeWarning.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  378]
 [   1 3316]]

Accuracy: 0.9875473741201949
Precision: 0.9021164021164021
Confusion Matrix:
 [[ 341    9]
 [  37 3307]]

Precison: 0.902
Recall: 0.974
F1 Score: 0.937


### Version 9 -- CV -- TfidfVectorizer
------

In [21]:
# ----- Load the text vectorizer and the pretrained presence classifier (Version 9, CV, TfidfVectorizer)
# NOTE: joblib.load unpickles arbitrary objects; only load artifacts from a trusted source.
cv = joblib.load("V9/CV/TfidfVectorizer/presence_TfidfVectorizer.joblib")
clf = joblib.load("V9/CV/TfidfVectorizer/bnb_presence_classifier.joblib")

# ----- Print the parameters reported by the loaded estimator
# NOTE(review): the recorded output suggests clf is a grid-search wrapper, so this shows the
# search configuration; clf.best_params_ would show the selected values -- confirm.
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))

# ----- Predict every row of the testing dataset with the default decision rule
# (an alternative 0.8 probability cut-off via predict_proba is not applied here).
# NOTE(review): the recorded metrics equal those of the CountVectorizer variant, presumably
# because the estimator binarizes features at 0.0 so only term presence matters -- confirm
# that both vectorizers share the same vocabulary.
pred_vec = clf.predict(cv.transform(data['content']))
data['prediction'] = pred_vec.tolist()

# ----- Export the rows predicted as dark pattern (prediction == 0)
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V9-CV-TfidfVectorizer.csv', index=False, header=True)

# ----- Export the rows where prediction and ground truth disagree
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V9-CV-TfidfVectorizer.csv', index=False, header=True)

# ----- Distribution of the predicted labels
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----- Overview of the prediction results
# Build the confusion matrix once, with the label order pinned so row/column 0 is
# always class 0 (dark pattern), and reuse it for the printout and the metrics.
cm = metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1])
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0))
print("Confusion Matrix:\n", cm)

# Rows are true labels, columns are predictions; class 0 is the positive class.
tp, fn = cm[0]
fp = cm[1][0]
# Guard the ratios so an empty denominator yields 0 instead of NaN plus a RuntimeWarning.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  378]
 [   1 3316]]

Accuracy: 0.9875473741201949
Precision: 0.9021164021164021
Confusion Matrix:
 [[ 341    9]
 [  37 3307]]

Precison: 0.902
Recall: 0.974
F1 Score: 0.937


### Version 9 -- HO -- CountVectorizer
------

In [22]:
# Evaluate the V9 / HO / CountVectorizer artifacts on `data`: predict, export the rows
# predicted as class 0 and the misclassified rows to CSV, then print summary metrics.

# -----Text Vectorizer Loading
# `cv` is the object whose .transform() turns the raw text into model features below.
cv = joblib.load("V9/HO/CountVectorizer/presence_CountVectorizer.joblib")

# -----Model Loading
clf = joblib.load("V9/HO/CountVectorizer/bnb_presence_classifier.joblib")

# -----Print the model parameters
# NOTE(review): the recorded output lists 'cv', 'param_grid' and 'estimator__*' keys, which
# suggests `clf` is a grid-search wrapper; if so, this prints the search configuration and
# `clf.best_params_` would hold the selected values -- confirm before quoting these numbers.
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# apply the pretrained model to the new content data
pred_vec = clf.predict(cv.transform(data['content']))

# ---------- apply threshold to be 0.8 (disabled)
# The names used in the line below (presence_model, presence_cv, presence_pred) are not
# defined in this cell; it is kept only as a reference for a probability-threshold variant.
# pre_pred_vec = (presence_model.predict_proba(presence_cv.transform(presence_pred['content']))[ : , 1] >= 0.8).astype(int)

# Adds (or overwrites) the 'prediction' column on the shared `data` frame.
data['prediction'] = pred_vec.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V9-HO-CountVectorizer.csv', index=False, header=True)

# ----misclassification dataframe: rows whose label differs from the prediction
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V9-HO-CountVectorizer.csv', index=False, header=True)

# ----print the distribution of the prediction, one (label, count) row per predicted label
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
# Build the confusion matrix once. Pinning labels=[0, 1] guarantees a 2x2 matrix in that
# order even when one class is missing from both the labels and the predictions, so the
# cell indexing below cannot raise IndexError or silently read the wrong cell.
cm = metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1])
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0))
print("Confusion Matrix:\n", cm)

# Rows are true labels, columns are predicted labels; class 0 is the positive class here.
tp = cm[0][0]  # true 0, predicted 0
fn = cm[0][1]  # true 0, predicted 1
fp = cm[1][0]  # true 1, predicted 0
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * precision * recall / (precision + recall)
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  407]
 [   1 3287]]

Accuracy: 0.9813210611802924
Precision: 0.8452088452088452
Confusion Matrix:
 [[ 344    6]
 [  63 3281]]

Precison: 0.845
Recall: 0.983
F1 Score: 0.909


### Version 9 -- HO -- TfidfVectorizer
------

In [23]:
# Evaluate the V9 / HO / TfidfVectorizer artifacts on `data`: predict, export the rows
# predicted as class 0 and the misclassified rows to CSV, then print summary metrics.

# -----Text Vectorizer Loading
# `cv` is the object whose .transform() turns the raw text into model features below.
cv = joblib.load("V9/HO/TfidfVectorizer/presence_TfidfVectorizer.joblib")

# -----Model Loading
clf = joblib.load("V9/HO/TfidfVectorizer/bnb_presence_classifier.joblib")

# -----Print the model parameters
# NOTE(review): the recorded output lists 'cv', 'param_grid' and 'estimator__*' keys, which
# suggests `clf` is a grid-search wrapper; if so, this prints the search configuration and
# `clf.best_params_` would hold the selected values -- confirm before quoting these numbers.
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# apply the pretrained model to the new content data
pred_vec = clf.predict(cv.transform(data['content']))

# ---------- apply threshold to be 0.8 (disabled)
# The names used in the line below (presence_model, presence_cv, presence_pred) are not
# defined in this cell; it is kept only as a reference for a probability-threshold variant.
# pre_pred_vec = (presence_model.predict_proba(presence_cv.transform(presence_pred['content']))[ : , 1] >= 0.8).astype(int)

# Adds (or overwrites) the 'prediction' column on the shared `data` frame.
data['prediction'] = pred_vec.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V9-HO-TfidfVectorizer.csv', index=False, header=True)

# ----misclassification dataframe: rows whose label differs from the prediction
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V9-HO-TfidfVectorizer.csv', index=False, header=True)

# ----print the distribution of the prediction, one (label, count) row per predicted label
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
# Build the confusion matrix once. Pinning labels=[0, 1] guarantees a 2x2 matrix in that
# order even when one class is missing from both the labels and the predictions, so the
# cell indexing below cannot raise IndexError or silently read the wrong cell.
cm = metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1])
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0))
print("Confusion Matrix:\n", cm)

# Rows are true labels, columns are predicted labels; class 0 is the positive class here.
tp = cm[0][0]  # true 0, predicted 0
fn = cm[0][1]  # true 0, predicted 1
fp = cm[1][0]  # true 1, predicted 0
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * precision * recall / (precision + recall)
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  382]
 [   1 3312]]

Accuracy: 0.9870059556036817
Precision: 0.8952879581151832
Confusion Matrix:
 [[ 342    8]
 [  40 3304]]

Precison: 0.895
Recall: 0.977
F1 Score: 0.934


### Version 10 -- CV
------

In [24]:
# Evaluate the V10 / CV artifacts on `data`: predict, export the rows predicted as class 0
# and the misclassified rows to CSV, then print summary metrics.

# -----Text Vectorizer Loading
# `cv` is the object whose .transform() turns the raw text into model features below.
cv = joblib.load("V10/CV/presence_TfidfVectorizer.joblib")

# -----Model Loading
clf = joblib.load("V10/CV/bnb_presence_classifier.joblib")

# -----Print the model parameters
# NOTE(review): the recorded output lists 'cv', 'param_grid' and 'estimator__*' keys, which
# suggests `clf` is a grid-search wrapper; if so, this prints the search configuration and
# `clf.best_params_` would hold the selected values -- confirm before quoting these numbers.
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# apply the pretrained model to the new content data
pred_vec = clf.predict(cv.transform(data['content']))

# ---------- apply threshold to be 0.8 (disabled)
# The names used in the line below (presence_model, presence_cv, presence_pred) are not
# defined in this cell; it is kept only as a reference for a probability-threshold variant.
# pre_pred_vec = (presence_model.predict_proba(presence_cv.transform(presence_pred['content']))[ : , 1] >= 0.8).astype(int)

# Adds (or overwrites) the 'prediction' column on the shared `data` frame.
data['prediction'] = pred_vec.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V10-CV.csv', index=False, header=True)

# ----misclassification dataframe: rows whose label differs from the prediction
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V10-CV.csv', index=False, header=True)

# ----print the distribution of the prediction, one (label, count) row per predicted label
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
# Build the confusion matrix once. Pinning labels=[0, 1] guarantees a 2x2 matrix in that
# order even when one class is missing from both the labels and the predictions, so the
# cell indexing below cannot raise IndexError or silently read the wrong cell.
cm = metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1])
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0))
print("Confusion Matrix:\n", cm)

# Rows are true labels, columns are predicted labels; class 0 is the positive class here.
tp = cm[0][0]  # true 0, predicted 0
fn = cm[0][1]  # true 0, predicted 1
fp = cm[1][0]  # true 1, predicted 0
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * precision * recall / (precision + recall)
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  369]
 [   1 3325]]

Accuracy: 0.9818624796968056
Precision: 0.8834688346883469
Confusion Matrix:
 [[ 326   24]
 [  43 3301]]

Precison: 0.883
Recall: 0.931
F1 Score: 0.907


### Version 10 -- HO
------

In [25]:
# Evaluate the V10 / HO artifacts on `data`: predict, export the rows predicted as class 0
# and the misclassified rows to CSV, then print summary metrics.

# -----Text Vectorizer Loading
# `cv` is the object whose .transform() turns the raw text into model features below.
cv = joblib.load("V10/HO/presence_TfidfVectorizer.joblib")

# -----Model Loading
clf = joblib.load("V10/HO/bnb_presence_classifier.joblib")

# -----Print the model parameters
# NOTE(review): the recorded output lists 'cv', 'param_grid' and 'estimator__*' keys, which
# suggests `clf` is a grid-search wrapper; if so, this prints the search configuration and
# `clf.best_params_` would hold the selected values -- confirm before quoting these numbers.
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# apply the pretrained model to the new content data
pred_vec = clf.predict(cv.transform(data['content']))

# ---------- apply threshold to be 0.8 (disabled)
# The names used in the line below (presence_model, presence_cv, presence_pred) are not
# defined in this cell; it is kept only as a reference for a probability-threshold variant.
# pre_pred_vec = (presence_model.predict_proba(presence_cv.transform(presence_pred['content']))[ : , 1] >= 0.8).astype(int)

# Adds (or overwrites) the 'prediction' column on the shared `data` frame.
data['prediction'] = pred_vec.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction'] == 0]
dark.to_csv('DP/V10-HO.csv', index=False, header=True)

# ----misclassification dataframe: rows whose label differs from the prediction
mis = data.loc[data['Classification'] != data['prediction']]
mis.to_csv('Misclassification/V10-HO.csv', index=False, header=True)

# ----print the distribution of the prediction, one (label, count) row per predicted label
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
# Build the confusion matrix once. Pinning labels=[0, 1] guarantees a 2x2 matrix in that
# order even when one class is missing from both the labels and the predictions, so the
# cell indexing below cannot raise IndexError or silently read the wrong cell.
cm = metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1])
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0))
print("Confusion Matrix:\n", cm)

# Rows are true labels, columns are predicted labels; class 0 is the positive class here.
tp = cm[0][0]  # true 0, predicted 0
fn = cm[0][1]  # true 0, predicted 1
fp = cm[1][0]  # true 1, predicted 0
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * precision * recall / (precision + recall)
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'iid': 'deprecated', 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0  362]
 [   1 3332]]

Accuracy: 0.9420682187330807
Precision: 0.6878453038674033
Confusion Matrix:
 [[ 249  101]
 [ 113 3231]]

Precison: 0.688
Recall: 0.711
F1 Score: 0.699
