# Confirmshaming Classifier Evaluation

This notebook evaluates the pretrained models of the Confirmshaming Classifier.

------


In [1]:
import pandas as pd 
import numpy as np

from sklearn.preprocessing import LabelEncoder

# Provides a simple way to tokenize a collection of text documents, build a vocabulary of known words, and encode new documents using that vocabulary.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Bernoulli Naive Bayes (similar to MultinomialNB) is suitable for discrete data. The difference between MultinomialNB and BernoulliNB is that while MultinomialNB works with occurrence counts, BernoulliNB is designed for binary/boolean features, which means that for text classification, word-occurrence vectors (rather than word-count vectors) may be more suitable for training and using this classifier.
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

# Evaluation metrics
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score

# joblib is a set of tools to provide lightweight pipelining in Python. It provides utilities for saving and loading Python objects that make use of NumPy data structures, efficiently.
import joblib

import matplotlib.pyplot as plt
# import seaborn as sns

## Using Testing Dataset "old_normal.csv"

---
Import the testing dataset

In [2]:
# Read the evaluation set from disk into a DataFrame.
data = pd.read_csv('old_normal.csv')

# Preview the first rows (head() shows 5 rows by default).
data.head()

Unnamed: 0,content,Classification
0,Zip-through cadet collar with chin guard,1
1,Zip Up Belted PU Moto Jacket,1
2,Youth Core Performance Soft Shell,1
3,Your Watchlist,1
4,Your Video Purchases & Rentals,1


---
`check the dataset information`

There are 3024 non-null content strings in the dataset.

In [3]:
# Summarize the columns, their dtypes and non-null counts to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3024 entries, 0 to 3023
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   content         3024 non-null   object
 1   Classification  3024 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 47.4+ KB


In [4]:
# Count how many rows carry each ground-truth label in the 'Classification' column.
tag_counts = data['Classification'].value_counts()

print('Distribution of the tags:\n{}'.format(tag_counts))

Distribution of the tags:
1    3024
Name: Classification, dtype: int64


---
# Bernoulli Naive Bayes Classifier


-----
### Duplicate Training Dataset

In [5]:
# Evaluate the Bernoulli Naive Bayes model from the "Duplicate" artifacts on the test set in `data`.

# -----Text Vectorizer Loading
cv = joblib.load("Duplicate/presence_TfidfVectorizer.joblib")

# -----Model Loading
clf = joblib.load("Duplicate/bnb_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# Lower-case the content, vectorize it with the loaded vectorizer, and apply the pretrained model.
pred_vec = clf.predict(cv.transform(data['content'].str.lower()))

# Overwrites any 'prediction' column left in `data` by a previously run evaluation cell.
data['prediction'] = pred_vec.tolist()

# ----Dark pattern content: rows where the predicted result equals 0.
dark = data.loc[data['prediction'] == 0]

dark.to_csv('DP/duplicate-bnb.csv', index=False, header=True)

# ----Misclassification dataframe: rows whose prediction differs from the ground truth.
mis = data.loc[data['Classification'] != data['prediction']]

mis.to_csv('Misclassification/duplicate-bnb.csv', index=False, header=True)

# ----Print the distribution of the prediction as (label, count) rows
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
# The tag distribution printed earlier shows only label 1 in this test set, so every
# predicted 0 is a false positive and precision for label 0 is 0.0 by construction.
# zero_division=0 keeps that 0.0 (without a warning) when the model predicts no 0 at all.
# labels=[0, 1] pins the confusion matrix to 2x2 even when only one label is present.
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0, zero_division=0))
print("Confusion Matrix:\n", metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1]))


The hyperparameters of the model:
{'alpha': 0, 'binarize': 0.0, 'class_prior': None, 'fit_prior': True}

The distribution of the prediction: 
[[   0   32]
 [   1 2992]]

Accuracy: 0.9894179894179894
Precision: 0.0
Confusion Matrix:
 [[   0    0]
 [  32 2992]]


-----
### SMOTE Training Dataset

In [6]:
# Evaluate the Bernoulli Naive Bayes model from the "SMOTE" artifacts on the test set in `data`.

# -----Text Vectorizer Loading
cv = joblib.load("SMOTE/presence_TfidfVectorizer.joblib")

# -----Model Loading
clf = joblib.load("SMOTE/bnb_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# Lower-case the content, vectorize it with the loaded vectorizer, and apply the pretrained model.
pred_vec = clf.predict(cv.transform(data['content'].str.lower()))

# Overwrites any 'prediction' column left in `data` by a previously run evaluation cell.
data['prediction'] = pred_vec.tolist()

# ----Dark pattern content: rows where the predicted result equals 0.
dark = data.loc[data['prediction'] == 0]

dark.to_csv('DP/smote-bnb.csv', index=False, header=True)

# ----Misclassification dataframe: rows whose prediction differs from the ground truth.
mis = data.loc[data['Classification'] != data['prediction']]

mis.to_csv('Misclassification/smote-bnb.csv', index=False, header=True)

# ----Print the distribution of the prediction as (label, count) rows
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
# The tag distribution printed earlier shows only label 1 in this test set, so every
# predicted 0 is a false positive and precision for label 0 is 0.0 by construction.
# zero_division=0 keeps that 0.0 (without a warning) when the model predicts no 0 at all.
# labels=[0, 1] pins the confusion matrix to 2x2 even when only one label is present.
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0, zero_division=0))
print("Confusion Matrix:\n", metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1]))


The hyperparameters of the model:
{'alpha': 0, 'binarize': 0.0, 'class_prior': None, 'fit_prior': True}

The distribution of the prediction: 
[[   0    6]
 [   1 3018]]

Accuracy: 0.998015873015873
Precision: 0.0
Confusion Matrix:
 [[   0    0]
 [   6 3018]]


---
# Logistic Regression Classifier


-----
### Duplicate Training Data

In [7]:
# Evaluate the Logistic Regression model from the "Duplicate" artifacts on the test set in `data`.

# -----Text Vectorizer Loading
cv = joblib.load("Duplicate/presence_TfidfVectorizer.joblib")

# -----Model Loading
clf = joblib.load("Duplicate/lr_presence_classifier.joblib")

# -----Print the model parameters
# NOTE(review): the recorded output lists `cv`, `param_grid` and `estimator__*` keys, so the
# loaded object looks like a grid-search wrapper; get_params() then shows the search
# configuration rather than the selected hyperparameters (consider clf.best_params_) -- confirm.
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# Lower-case the content, vectorize it with the loaded vectorizer, and apply the pretrained model.
pred_vec = clf.predict(cv.transform(data['content'].str.lower()))

# Overwrites any 'prediction' column left in `data` by a previously run evaluation cell.
data['prediction'] = pred_vec.tolist()

# ----Dark pattern content: rows where the predicted result equals 0.
dark = data.loc[data['prediction'] == 0]

dark.to_csv('DP/duplicate-lr.csv', index=False, header=True)

# ----Misclassification dataframe: rows whose prediction differs from the ground truth.
mis = data.loc[data['Classification'] != data['prediction']]

mis.to_csv('Misclassification/duplicate-lr.csv', index=False, header=True)

# ----Print the distribution of the prediction as (label, count) rows
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
# The tag distribution printed earlier shows only label 1 in this test set, so every
# predicted 0 is a false positive and precision for label 0 is 0.0 by construction.
# zero_division=0 keeps that 0.0 (without a warning) when the model predicts no 0 at all.
# labels=[0, 1] pins the confusion matrix to 2x2 even when only one label is present.
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0, zero_division=0))
print("Confusion Matrix:\n", metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1]))


The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__C': 1.0, 'estimator__class_weight': None, 'estimator__dual': False, 'estimator__fit_intercept': True, 'estimator__intercept_scaling': 1, 'estimator__l1_ratio': None, 'estimator__max_iter': 100, 'estimator__multi_class': 'auto', 'estimator__n_jobs': None, 'estimator__penalty': 'l2', 'estimator__random_state': None, 'estimator__solver': 'lbfgs', 'estimator__tol': 0.0001, 'estimator__verbose': 0, 'estimator__warm_start': False, 'estimator': LogisticRegression(), 'n_jobs': -1, 'param_grid': {'penalty': ['l1', 'l2'], 'solver': ['lbfgs', 'newton-cg', 'sag']}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0    6]
 [   1 3018]]

Accuracy: 0.998015873015873
Precision: 0.0
Confusion Matrix:
 [[   0    0]
 [   6 3018]]


-----
### SMOTE Training Data

In [8]:
# Evaluate the Logistic Regression model from the "SMOTE" artifacts on the test set in `data`.

# -----Text Vectorizer Loading
cv = joblib.load("SMOTE/presence_TfidfVectorizer.joblib")

# -----Model Loading
clf = joblib.load("SMOTE/lr_presence_classifier.joblib")

# -----Print the model parameters
# NOTE(review): the recorded output lists `cv`, `param_grid` and `estimator__*` keys, so the
# loaded object looks like a grid-search wrapper; get_params() then shows the search
# configuration rather than the selected hyperparameters (consider clf.best_params_) -- confirm.
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# Lower-case the content, vectorize it with the loaded vectorizer, and apply the pretrained model.
pred_vec = clf.predict(cv.transform(data['content'].str.lower()))

# Overwrites any 'prediction' column left in `data` by a previously run evaluation cell.
data['prediction'] = pred_vec.tolist()

# ----Dark pattern content: rows where the predicted result equals 0.
dark = data.loc[data['prediction'] == 0]

dark.to_csv('DP/smote-lr.csv', index=False, header=True)

# ----Misclassification dataframe: rows whose prediction differs from the ground truth.
mis = data.loc[data['Classification'] != data['prediction']]

mis.to_csv('Misclassification/smote-lr.csv', index=False, header=True)

# ----Print the distribution of the prediction as (label, count) rows
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
# The tag distribution printed earlier shows only label 1 in this test set, so every
# predicted 0 is a false positive and precision for label 0 is 0.0 by construction.
# zero_division=0 keeps that 0.0 (without a warning) when the model predicts no 0 at all.
# labels=[0, 1] pins the confusion matrix to 2x2 even when only one label is present.
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0, zero_division=0))
print("Confusion Matrix:\n", metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1]))



The hyperparameters of the model:
{'cv': 5, 'error_score': nan, 'estimator__C': 1.0, 'estimator__class_weight': None, 'estimator__dual': False, 'estimator__fit_intercept': True, 'estimator__intercept_scaling': 1, 'estimator__l1_ratio': None, 'estimator__max_iter': 100, 'estimator__multi_class': 'auto', 'estimator__n_jobs': None, 'estimator__penalty': 'l2', 'estimator__random_state': None, 'estimator__solver': 'lbfgs', 'estimator__tol': 0.0001, 'estimator__verbose': 0, 'estimator__warm_start': False, 'estimator': LogisticRegression(), 'n_jobs': -1, 'param_grid': {'penalty': ['l1', 'l2'], 'solver': ['lbfgs', 'newton-cg', 'sag']}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 1}

The distribution of the prediction: 
[[   0    3]
 [   1 3021]]

Accuracy: 0.9990079365079365
Precision: 0.0
Confusion Matrix:
 [[   0    0]
 [   3 3021]]


---
# Support Vector Machine Classifier


--------
### Duplicate Training Dataset

In [9]:
# Evaluate the Support Vector Machine model from the "Duplicate" artifacts on the test set in `data`.

# -----Text Vectorizer Loading
cv = joblib.load("Duplicate/presence_TfidfVectorizer.joblib")

# -----Model Loading
clf = joblib.load("Duplicate/svm_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# Lower-case the content, vectorize it with the loaded vectorizer, and apply the pretrained model.
pred_vec = clf.predict(cv.transform(data['content'].str.lower()))

# Overwrites any 'prediction' column left in `data` by a previously run evaluation cell.
data['prediction'] = pred_vec.tolist()

# ----Dark pattern content: rows where the predicted result equals 0.
dark = data.loc[data['prediction'] == 0]

dark.to_csv('DP/duplicate-svm.csv', index=False, header=True)

# ----Misclassification dataframe: rows whose prediction differs from the ground truth.
mis = data.loc[data['Classification'] != data['prediction']]

mis.to_csv('Misclassification/duplicate-svm.csv', index=False, header=True)

# ----Print the distribution of the prediction as (label, count) rows
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
# The tag distribution printed earlier shows only label 1 in this test set, so every
# predicted 0 is a false positive and precision for label 0 is 0.0 by construction.
# zero_division=0 keeps that 0.0 (without a warning) when the model predicts no 0 at all.
# labels=[0, 1] pins the confusion matrix to 2x2 even when only one label is present.
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0, zero_division=0))
print("Confusion Matrix:\n", metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1]))



The hyperparameters of the model:
{'C': 0.1, 'class_weight': None, 'dual': True, 'fit_intercept': True, 'intercept_scaling': 1, 'loss': 'squared_hinge', 'max_iter': 1000, 'multi_class': 'ovr', 'penalty': 'l2', 'random_state': None, 'tol': 0.0001, 'verbose': 0}

The distribution of the prediction: 
[[   0    8]
 [   1 3016]]

Accuracy: 0.9973544973544973
Precision: 0.0
Confusion Matrix:
 [[   0    0]
 [   8 3016]]


--------
### SMOTE Training Dataset

In [10]:
# Evaluate the Support Vector Machine model from the "SMOTE" artifacts on the test set in `data`.

# -----Text Vectorizer Loading
cv = joblib.load("SMOTE/presence_TfidfVectorizer.joblib")

# -----Model Loading
clf = joblib.load("SMOTE/svm_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# Lower-case the content, vectorize it with the loaded vectorizer, and apply the pretrained model.
pred_vec = clf.predict(cv.transform(data['content'].str.lower()))

# Overwrites any 'prediction' column left in `data` by a previously run evaluation cell.
data['prediction'] = pred_vec.tolist()

# ----Dark pattern content: rows where the predicted result equals 0.
dark = data.loc[data['prediction'] == 0]

dark.to_csv('DP/smote-svm.csv', index=False, header=True)

# ----Misclassification dataframe: rows whose prediction differs from the ground truth.
mis = data.loc[data['Classification'] != data['prediction']]

mis.to_csv('Misclassification/smote-svm.csv', index=False, header=True)

# ----Print the distribution of the prediction as (label, count) rows
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
# The tag distribution printed earlier shows only label 1 in this test set, so every
# predicted 0 is a false positive and precision for label 0 is 0.0 by construction.
# zero_division=0 keeps that 0.0 (without a warning) when the model predicts no 0 at all.
# labels=[0, 1] pins the confusion matrix to 2x2 even when only one label is present.
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0, zero_division=0))
print("Confusion Matrix:\n", metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1]))


The hyperparameters of the model:
{'C': 0.1, 'class_weight': None, 'dual': True, 'fit_intercept': True, 'intercept_scaling': 1, 'loss': 'squared_hinge', 'max_iter': 1000, 'multi_class': 'ovr', 'penalty': 'l2', 'random_state': None, 'tol': 0.0001, 'verbose': 0}

The distribution of the prediction: 
[[   0    5]
 [   1 3019]]

Accuracy: 0.9983465608465608
Precision: 0.0
Confusion Matrix:
 [[   0    0]
 [   5 3019]]


---
# Random Forest Classifier


-------
### Duplicate Training Data

In [11]:
# Evaluate the Random Forest model from the "Duplicate" artifacts on the test set in `data`.

# -----Text Vectorizer Loading
cv = joblib.load("Duplicate/presence_TfidfVectorizer.joblib")

# -----Model Loading
clf = joblib.load("Duplicate/rf_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# Lower-case the content, vectorize it with the loaded vectorizer, and apply the pretrained model.
pred_vec = clf.predict(cv.transform(data['content'].str.lower()))

# Overwrites any 'prediction' column left in `data` by a previously run evaluation cell.
data['prediction'] = pred_vec.tolist()

# ----Dark pattern content: rows where the predicted result equals 0.
dark = data.loc[data['prediction'] == 0]

dark.to_csv('DP/duplicate-rf.csv', index=False, header=True)

# ----Misclassification dataframe: rows whose prediction differs from the ground truth.
mis = data.loc[data['Classification'] != data['prediction']]

mis.to_csv('Misclassification/duplicate-rf.csv', index=False, header=True)

# ----Print the distribution of the prediction as (label, count) rows
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
# The tag distribution printed earlier shows only label 1 in this test set, so every
# predicted 0 is a false positive and precision for label 0 is 0.0 by construction.
# zero_division=0 keeps that 0.0 (without a warning) when the model predicts no 0 at all.
# labels=[0, 1] pins the confusion matrix to 2x2 even when only one label is present.
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0, zero_division=0))
print("Confusion Matrix:\n", metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1]))


The hyperparameters of the model:
{'bootstrap': False, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'entropy', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}

The distribution of the prediction: 
[[   0    3]
 [   1 3021]]

Accuracy: 0.9990079365079365
Precision: 0.0
Confusion Matrix:
 [[   0    0]
 [   3 3021]]


-------
### SMOTE Training Data

In [12]:
# Evaluate the Random Forest model from the "SMOTE" artifacts on the test set in `data`.

# -----Text Vectorizer Loading
cv = joblib.load("SMOTE/presence_TfidfVectorizer.joblib")

# -----Model Loading
clf = joblib.load("SMOTE/rf_presence_classifier.joblib")

# -----Print the model parameters
print("The hyperparameters of the model:\n{}\n".format(clf.get_params()))


# -----Make predictions on the testing dataset
# Lower-case the content, vectorize it with the loaded vectorizer, and apply the pretrained model.
pred_vec = clf.predict(cv.transform(data['content'].str.lower()))

# Overwrites any 'prediction' column left in `data` by a previously run evaluation cell.
data['prediction'] = pred_vec.tolist()

# ----Dark pattern content: rows where the predicted result equals 0.
dark = data.loc[data['prediction'] == 0]

dark.to_csv('DP/smote-rf.csv', index=False, header=True)

# ----Misclassification dataframe: rows whose prediction differs from the ground truth.
mis = data.loc[data['Classification'] != data['prediction']]

mis.to_csv('Misclassification/smote-rf.csv', index=False, header=True)

# ----Print the distribution of the prediction as (label, count) rows
(unique, counts) = np.unique(pred_vec, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The distribution of the prediction: \n{}\n".format(frequencies))

# ----Overview of the prediction results
# The tag distribution printed earlier shows only label 1 in this test set, so every
# predicted 0 is a false positive and precision for label 0 is 0.0 by construction.
# zero_division=0 keeps that 0.0 (without a warning) when the model predicts no 0 at all.
# labels=[0, 1] pins the confusion matrix to 2x2 even when only one label is present.
print("Accuracy:", metrics.accuracy_score(data['Classification'], pred_vec))
print("Precision:", metrics.precision_score(data['Classification'], pred_vec, pos_label=0, zero_division=0))
print("Confusion Matrix:\n", metrics.confusion_matrix(data['Classification'], pred_vec, labels=[0, 1]))


The hyperparameters of the model:
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 50, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}

The distribution of the prediction: 
[[   0    2]
 [   1 3022]]

Accuracy: 0.9993386243386243
Precision: 0.0
Confusion Matrix:
 [[   0    0]
 [   2 3022]]
