# Confirmshaming Classifier Evaluation

This script is used for the model evaluation for Confirmshaming Classifier.

------


In [1]:
import pandas as pd 
import numpy as np

from sklearn.preprocessing import LabelEncoder

# provides a simple way to both tokenize a collection of text documents and build a vocabulary of known words, but also to encode new documents using that vocabulary.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Bernoulli Naive Bayes (Similar as  MultinomialNB), this classifier is suitable for discrete data. The difference between MultinomialNB and BernoulliNB is that while  MultinomialNB works with occurrence counts, BernoulliNB is designed for binary/boolen features, which means in the case of text classification, word occurrence vectores (rather than word count vectors) may be more suitable to be used to train and use this classifier.
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

# Evaluation metrics
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score

# joblib is a set of tools to provide lightweight pipelining in Python. It provides utilities for saving and loading Python objects that make use of NumPy data structures, efficiently.
import joblib

import matplotlib.pyplot as plt
# import seaborn as sns

## Using Testing Dataset "new_confirm.csv"

---
Import the testing dataset

In [2]:
data = pd.read_csv('new_confirm.csv')

data.head(5)

Unnamed: 0,content,Classification
0,YES! YOU HAD ME AT FREE,Not_Dark
1,START MY FREE TRIAL NOW,Not_Dark
2,See FREE Summary,Not_Dark
3,Continue —>,Not_Dark
4,UNLOCK THE BEST CARS NOBODY BUYS,Not_Dark


---
`check the dataset information`

There are 3694 NOT NULL instances of content strings in the dataset.

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   content         208 non-null    object
 1   Classification  208 non-null    object
dtypes: object(2)
memory usage: 3.4+ KB


In [4]:
# check the distribution of the target value --- classification.

print('Distribution of the tags:\n{}'.format(data['Classification'].value_counts()))

Distribution of the tags:
Dark        105
Not_Dark    103
Name: Classification, dtype: int64


---
## Data Preparation

---
`Encode the target vales into integers` --- 'classification'

In [5]:
data['Classification'].replace({"Dark": 0, "Not_Dark": 1}, inplace = True)

data.head(5)

Unnamed: 0,content,Classification
0,YES! YOU HAD ME AT FREE,1
1,START MY FREE TRIAL NOW,1
2,See FREE Summary,1
3,Continue —>,1
4,UNLOCK THE BEST CARS NOBODY BUYS,1


In [6]:
# check the distribution of the target value --- classification.

print('Distribution of the tags:\n{}'.format(data['Classification'].value_counts()))

Distribution of the tags:
0    105
1    103
Name: Classification, dtype: int64


---
# Bernoulli Naive Bayes Classifier


-----
### Duplicate Training Dataset

In [7]:
# -----Text Vectorizer Loading
cv = joblib.load("Duplicate/presence_TfidfVectorizer.joblib")

# -----Model Loading'
clf = joblib.load("Duplicate/bnb_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# apply the pretrained model to the new content data
pred_vec = clf.predict(cv.transform(data['content'].str.lower()))

# ---------- apply threshold to be 0.8
# pre_pred_vec = (presence_model.predict_proba(presence_cv.transform(presence_pred['content']))[ : , 1] >= 0.8).astype(int)

data['prediction'] = pred_vec.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction']==0]

dark.to_csv('DP/duplicate-bnb.csv', index = False, header = True)

# ----misclassification dataframe
mis = data.loc[data['Classification'] != data['prediction']]

mis.to_csv('Misclassification/duplicate-bnb.csv', index = False, header = True)

# ----print the distribution of the prediction
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'],pred_vec, pos_label=0))
print("Confusion Matrix:\n", metrics.confusion_matrix(data['Classification'], pred_vec))

cm = metrics.confusion_matrix(data['Classification'], pred_vec)
precision = cm[0][0]/(cm[0][0]+cm[1][0])
recall = cm[0][0]/(cm[0][0]+cm[0][1])
f1 = 2*precision*recall/(precision+recall)
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[  0  69]
 [  1 139]]

Accuracy: 0.7307692307692307
Precision: 0.855072463768116
Confusion Matrix:
 [[59 46]
 [10 93]]

Precison: 0.855
Recall: 0.562
F1 Score: 0.678


-----
### SMOTE Training Dataset

In [15]:
# -----Text Vectorizer Loading
cv = joblib.load("SMOTE/presence_TfidfVectorizer.joblib")

# -----Model Loading'
clf = joblib.load("SMOTE/bnb_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# apply the pretrained model to the new content data
pred_vec = clf.predict(cv.transform(data['content'].str.lower()))

# ---------- apply threshold to be 0.8
# pre_pred_vec = (presence_model.predict_proba(presence_cv.transform(presence_pred['content']))[ : , 1] >= 0.8).astype(int)

data['prediction'] = pred_vec.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction']==0]

dark.to_csv('DP/smote-bnb.csv', index = False, header = True)

# ----misclassification dataframe
mis = data.loc[data['Classification'] != data['prediction']]

mis.to_csv('Misclassification/smote-bnb.csv', index = False, header = True)

# ----print the distribution of the prediction
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'],pred_vec, pos_label=0))
print("Confusion Matrix:\n", metrics.confusion_matrix(data['Classification'], pred_vec))

cm = metrics.confusion_matrix(data['Classification'], pred_vec)
precision = cm[0][0]/(cm[0][0]+cm[1][0])
recall = cm[0][0]/(cm[0][0]+cm[0][1])
f1 = 2*precision*recall/(precision+recall)
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__alpha': 1.0, 'estimator__binarize': 0.0, 'estimator__class_prior': None, 'estimator__fit_prior': True, 'estimator': BernoulliNB(), 'n_jobs': -1, 'param_grid': {'alpha': [0, 1], 'fit_prior': [True, False]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[  0  57]
 [  1 151]]

Accuracy: 0.7307692307692307
Precision: 0.9298245614035088
Confusion Matrix:
 [[53 52]
 [ 4 99]]

Precison: 0.930
Recall: 0.505
F1 Score: 0.654


---
# Logistic Regression Classifier


-----
### Duplicate Training Data

In [9]:
# -----Text Vectorizer Loading
cv = joblib.load("Duplicate/presence_TfidfVectorizer.joblib")

# -----Model Loading'
clf = joblib.load("Duplicate/lr_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# apply the pretrained model to the new content data
pred_vec = clf.predict(cv.transform(data['content'].str.lower()))

# ---------- apply threshold to be 0.8
# pre_pred_vec = (presence_model.predict_proba(presence_cv.transform(presence_pred['content']))[ : , 1] >= 0.8).astype(int)

data['prediction'] = pred_vec.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction']==0]

dark.to_csv('DP/duplicate-lr.csv', index = False, header = True)

# ----misclassification dataframe
mis = data.loc[data['Classification'] != data['prediction']]

mis.to_csv('Misclassification/duplicate-lr.csv', index = False, header = True)

# ----print the distribution of the prediction
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'],pred_vec, pos_label=0))
print("Confusion Matrix:\n", metrics.confusion_matrix(data['Classification'], pred_vec))

cm = metrics.confusion_matrix(data['Classification'], pred_vec)
precision = cm[0][0]/(cm[0][0]+cm[1][0])
recall = cm[0][0]/(cm[0][0]+cm[0][1])
f1 = 2*precision*recall/(precision+recall)
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__C': 1.0, 'estimator__class_weight': None, 'estimator__dual': False, 'estimator__fit_intercept': True, 'estimator__intercept_scaling': 1, 'estimator__l1_ratio': None, 'estimator__max_iter': 100, 'estimator__multi_class': 'auto', 'estimator__n_jobs': None, 'estimator__penalty': 'l2', 'estimator__random_state': None, 'estimator__solver': 'lbfgs', 'estimator__tol': 0.0001, 'estimator__verbose': 0, 'estimator__warm_start': False, 'estimator': LogisticRegression(), 'n_jobs': -1, 'param_grid': {'penalty': ['l1', 'l2'], 'solver': ['lbfgs', 'newton-cg', 'sag']}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[  0  97]
 [  1 111]]

Accuracy: 0.9519230769230769
Precision: 0.9896907216494846
Confusion Matrix:
 [[ 96   9]
 [  1 102]]

Precison: 0.990
Recall: 0.914
F1 Score: 0.950


-----
### SMOTE Training Data

In [10]:
# -----Text Vectorizer Loading
cv = joblib.load("SMOTE/presence_TfidfVectorizer.joblib")

# -----Model Loading'
clf = joblib.load("SMOTE/lr_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# apply the pretrained model to the new content data
pred_vec = clf.predict(cv.transform(data['content'].str.lower()))

# ---------- apply threshold to be 0.8
# pre_pred_vec = (presence_model.predict_proba(presence_cv.transform(presence_pred['content']))[ : , 1] >= 0.8).astype(int)

data['prediction'] = pred_vec.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction']==0]

dark.to_csv('DP/smote-lr.csv', index = False, header = True)

# ----misclassification dataframe
mis = data.loc[data['Classification'] != data['prediction']]

mis.to_csv('Misclassification/smote-lr.csv', index = False, header = True)

# ----print the distribution of the prediction
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'],pred_vec, pos_label=0))
print("Confusion Matrix:\n", metrics.confusion_matrix(data['Classification'], pred_vec))

cm = metrics.confusion_matrix(data['Classification'], pred_vec)
precision = cm[0][0]/(cm[0][0]+cm[1][0])
recall = cm[0][0]/(cm[0][0]+cm[0][1])
f1 = 2*precision*recall/(precision+recall)
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__C': 1.0, 'estimator__class_weight': None, 'estimator__dual': False, 'estimator__fit_intercept': True, 'estimator__intercept_scaling': 1, 'estimator__l1_ratio': None, 'estimator__max_iter': 100, 'estimator__multi_class': 'auto', 'estimator__n_jobs': None, 'estimator__penalty': 'l2', 'estimator__random_state': None, 'estimator__solver': 'lbfgs', 'estimator__tol': 0.0001, 'estimator__verbose': 0, 'estimator__warm_start': False, 'estimator': LogisticRegression(), 'n_jobs': -1, 'param_grid': {'penalty': ['l1', 'l2'], 'solver': ['lbfgs', 'newton-cg', 'sag']}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[  0  93]
 [  1 115]]

Accuracy: 0.9423076923076923
Precision: 1.0
Confusion Matrix:
 [[ 93  12]
 [  0 103]]

Precison: 1.000
Recall: 0.886
F1 Score: 0.939


---
# Support Vector Machine Classifier


--------
### Duplicate Training Dataset

In [11]:
# -----Text Vectorizer Loading
cv = joblib.load("Duplicate/presence_TfidfVectorizer.joblib")

# -----Model Loading'
clf = joblib.load("Duplicate/svm_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# apply the pretrained model to the new content data
pred_vec = clf.predict(cv.transform(data['content'].str.lower()))

# ---------- apply threshold to be 0.8
# pre_pred_vec = (presence_model.predict_proba(presence_cv.transform(presence_pred['content']))[ : , 1] >= 0.8).astype(int)

data['prediction'] = pred_vec.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction']==0]

dark.to_csv('DP/duplicate-svm.csv', index = False, header = True)

# ----misclassification dataframe
mis = data.loc[data['Classification'] != data['prediction']]

mis.to_csv('Misclassification/duplicate-svm.csv', index = False, header = True)

# ----print the distribution of the prediction
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'],pred_vec, pos_label=0))
print("Confusion Matrix:\n", metrics.confusion_matrix(data['Classification'], pred_vec))

cm = metrics.confusion_matrix(data['Classification'], pred_vec)
precision = cm[0][0]/(cm[0][0]+cm[1][0])
recall = cm[0][0]/(cm[0][0]+cm[0][1])
f1 = 2*precision*recall/(precision+recall)
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__C': 1.0, 'estimator__class_weight': None, 'estimator__dual': True, 'estimator__fit_intercept': True, 'estimator__intercept_scaling': 1, 'estimator__loss': 'squared_hinge', 'estimator__max_iter': 1000, 'estimator__multi_class': 'ovr', 'estimator__penalty': 'l2', 'estimator__random_state': None, 'estimator__tol': 0.0001, 'estimator__verbose': 0, 'estimator': LinearSVC(), 'n_jobs': -1, 'param_grid': {'C': [0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[  0  87]
 [  1 121]]

Accuracy: 0.9134615384615384
Precision: 1.0
Confusion Matrix:
 [[ 87  18]
 [  0 103]]

Precison: 1.000
Recall: 0.829
F1 Score: 0.906


--------
### SMOTE Training Dataset

In [12]:
# -----Text Vectorizer Loading
cv = joblib.load("SMOTE/presence_TfidfVectorizer.joblib")

# -----Model Loading'
clf = joblib.load("SMOTE/svm_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# apply the pretrained model to the new content data
pred_vec = clf.predict(cv.transform(data['content'].str.lower()))

# ---------- apply threshold to be 0.8
# pre_pred_vec = (presence_model.predict_proba(presence_cv.transform(presence_pred['content']))[ : , 1] >= 0.8).astype(int)

data['prediction'] = pred_vec.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction']==0]

dark.to_csv('DP/smote-svm.csv', index = False, header = True)

# ----misclassification dataframe
mis = data.loc[data['Classification'] != data['prediction']]

mis.to_csv('Misclassification/smote-svm.csv', index = False, header = True)

# ----print the distribution of the prediction
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'],pred_vec, pos_label=0))
print("Confusion Matrix:\n", metrics.confusion_matrix(data['Classification'], pred_vec))

cm = metrics.confusion_matrix(data['Classification'], pred_vec)
precision = cm[0][0]/(cm[0][0]+cm[1][0])
recall = cm[0][0]/(cm[0][0]+cm[0][1])
f1 = 2*precision*recall/(precision+recall)
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__C': 1.0, 'estimator__class_weight': None, 'estimator__dual': True, 'estimator__fit_intercept': True, 'estimator__intercept_scaling': 1, 'estimator__loss': 'squared_hinge', 'estimator__max_iter': 1000, 'estimator__multi_class': 'ovr', 'estimator__penalty': 'l2', 'estimator__random_state': None, 'estimator__tol': 0.0001, 'estimator__verbose': 0, 'estimator': LinearSVC(), 'n_jobs': -1, 'param_grid': {'C': [0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[  0  85]
 [  1 123]]

Accuracy: 0.9038461538461539
Precision: 1.0
Confusion Matrix:
 [[ 85  20]
 [  0 103]]

Precison: 1.000
Recall: 0.810
F1 Score: 0.895


---
# Random Forest Classifier


-------
### Duplicate Training Data

In [13]:
# -----Text Vectorizer Loading
cv = joblib.load("Duplicate/presence_TfidfVectorizer.joblib")

# -----Model Loading'
clf = joblib.load("Duplicate/rf_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# apply the pretrained model to the new content data
pred_vec = clf.predict(cv.transform(data['content'].str.lower()))

# ---------- apply threshold to be 0.8
# pre_pred_vec = (presence_model.predict_proba(presence_cv.transform(presence_pred['content']))[ : , 1] >= 0.8).astype(int)

data['prediction'] = pred_vec.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction']==0]

dark.to_csv('DP/duplicate-rf.csv', index = False, header = True)

# ----misclassification dataframe
mis = data.loc[data['Classification'] != data['prediction']]

mis.to_csv('Misclassification/duplicate-rf.csv', index = False, header = True)

# ----print the distribution of the prediction
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'],pred_vec, pos_label=0))
print("Confusion Matrix:\n", metrics.confusion_matrix(data['Classification'], pred_vec))

cm = metrics.confusion_matrix(data['Classification'], pred_vec)
precision = cm[0][0]/(cm[0][0]+cm[1][0])
recall = cm[0][0]/(cm[0][0]+cm[0][1])
f1 = 2*precision*recall/(precision+recall)
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__bootstrap': True, 'estimator__ccp_alpha': 0.0, 'estimator__class_weight': None, 'estimator__criterion': 'gini', 'estimator__max_depth': None, 'estimator__max_features': 'auto', 'estimator__max_leaf_nodes': None, 'estimator__max_samples': None, 'estimator__min_impurity_decrease': 0.0, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__min_weight_fraction_leaf': 0.0, 'estimator__n_estimators': 100, 'estimator__n_jobs': None, 'estimator__oob_score': False, 'estimator__random_state': None, 'estimator__verbose': 0, 'estimator__warm_start': False, 'estimator': RandomForestClassifier(), 'n_jobs': -1, 'param_grid': {'bootstrap': [True, False], 'criterion': ['gini', 'entropy'], 'max_depth': [10, 20, 30, 40, 50, None], 'min_samples_leaf': [1, 2, 4], 'min_samples_split': [2, 5, 10], 'n_estimators': [100, 200, 300]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scor

-------
### SMOTE Training Data

In [14]:
# -----Text Vectorizer Loading
cv = joblib.load("SMOTE/presence_TfidfVectorizer.joblib")

# -----Model Loading'
clf = joblib.load("SMOTE/rf_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# apply the pretrained model to the new content data
pred_vec = clf.predict(cv.transform(data['content'].str.lower()))

# ---------- apply threshold to be 0.8
# pre_pred_vec = (presence_model.predict_proba(presence_cv.transform(presence_pred['content']))[ : , 1] >= 0.8).astype(int)

data['prediction'] = pred_vec.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction']==0]

dark.to_csv('DP/smote-rf.csv', index = False, header = True)

# ----misclassification dataframe
mis = data.loc[data['Classification'] != data['prediction']]

mis.to_csv('Misclassification/smote-rf.csv', index = False, header = True)

# ----print the distribution of the prediction
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'],pred_vec, pos_label=0))
print("Confusion Matrix:\n", metrics.confusion_matrix(data['Classification'], pred_vec))

cm = metrics.confusion_matrix(data['Classification'], pred_vec)
precision = cm[0][0]/(cm[0][0]+cm[1][0])
recall = cm[0][0]/(cm[0][0]+cm[0][1])
f1 = 2*precision*recall/(precision+recall)
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__bootstrap': True, 'estimator__ccp_alpha': 0.0, 'estimator__class_weight': None, 'estimator__criterion': 'gini', 'estimator__max_depth': None, 'estimator__max_features': 'auto', 'estimator__max_leaf_nodes': None, 'estimator__max_samples': None, 'estimator__min_impurity_decrease': 0.0, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__min_weight_fraction_leaf': 0.0, 'estimator__n_estimators': 100, 'estimator__n_jobs': None, 'estimator__oob_score': False, 'estimator__random_state': None, 'estimator__verbose': 0, 'estimator__warm_start': False, 'estimator': RandomForestClassifier(), 'n_jobs': -1, 'param_grid': {'bootstrap': [True, False], 'criterion': ['gini', 'entropy'], 'max_depth': [10, 20, 30, 40, 50, None], 'min_samples_leaf': [1, 2, 4], 'min_samples_split': [2, 5, 10], 'n_estimators': [100, 200, 300]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scor