In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
# joblib is a set of tools to provide lightweight pipelining in Python. It provides utilities for saving and loading Python objects that make use of NumPy data structures, efficiently.
import joblib
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences
from sklearn import metrics

#!pip install h5py pyyaml

In [2]:
# Load the test set and print its schema (column names, dtypes, non-null counts).
data = pd.read_csv(filepath_or_buffer='test_data.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3694 entries, 0 to 3693
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   content         3694 non-null   object
 1   Classification  3694 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 57.8+ KB


In [3]:
# Preview the first five rows of the test set.
data.head(n=5)

Unnamed: 0,content,Classification
0,Your price for this item is $ 89,1
1,Your price for this item is $ 79,1
2,Your price for this item is $ 55,1
3,Your price for this item is $ 49,1
4,Your price for this item is $ 21,1


-------
## Data Preparation

In [4]:
# Pull the raw text and the integer class labels out of the frame as NumPy arrays.
X, Y = data['content'].values, data['Classification'].values

In [5]:
# Fit the label encoder on the raw labels and encode them in a single step.
encoder = LabelEncoder()
y = encoder.fit_transform(Y)

# Show which encoded integer each original label maps to.
# NOTE(review): the raw labels are already the integers 0/1, so the encoding is
# the identity here; the evaluation cells below treat 0 as the dark-pattern class.
integer_mapping = dict(zip(encoder.classes_, range(len(encoder.classes_))))
print("Label encoding mapping: {}\n".format(integer_mapping))

# Per-class counts of the raw labels, one [label, count] row per class.
unique, counts = np.unique(Y, return_counts=True)
frequencies_y_label = np.column_stack((unique, counts))
print('The frequency distribution of testing y labels:\n',frequencies_y_label)

# Same table for the encoded labels.
unique, counts = np.unique(y, return_counts=True)
frequencies_y_encode_label = np.column_stack((unique, counts))
print('The frequency distribution of testing encoded y labels:\n',frequencies_y_encode_label)

Label encoding mapping: {0: 0, 1: 1}

The frequency distribution of testing y labels:
 [[   0  350]
 [   1 3344]]
The frequency distribution of testing encoded y labels:
 [[   0  350]
 [   1 3344]]


-----
# Test the CNN Model

-----
## Embedding on the Fly --- Model 1

In [9]:
# Evaluate "Embedding on the Fly" CNN model 1 on the test set: load the saved
# tokenizer and model, predict a class per row, export the predicted
# dark-pattern rows and the misclassified rows, then report metrics.

# ---- Load the vectorizer
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a trusted source.
cv = joblib.load("V10/EmbOTF_HO/Presence_Tokenizer.joblib")

# ---- Load the model
clf = keras.models.load_model('V10/EmbOTF_HO/CNN_model1.h5')

print('Model Summary: \n')
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would emit a stray "None" after the table).
clf.summary()

# ---- Text Vectorization
# Vectorization: each text becomes a sequence of token ids.
content = cv.texts_to_sequences(data['content'])
# Padding: sequences are post-padded to `maxlen` ids.
maxlen = 20
pad_content = pad_sequences(content, padding='post', maxlen=maxlen)

# ---- Make Prediction
# Sequential.predict_classes() was removed in TensorFlow 2.6. For a model with
# a single output unit it thresholded predict() at 0.5, which is reproduced
# here explicitly (same approach as the pre-trained embedding cells).
prediction = clf.predict(pad_content)
pred = (prediction > 0.5).astype('int32')
pred_list = pred.flatten()

# Overwrites any 'prediction' column left by a previously run model cell.
data['prediction'] = pred_list.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction']==0]

dark.to_csv('DP/V10-EmbOTF-model1.csv', index = False, header = True)

# ----misclassification dataframe
mis = data.loc[data['Classification'] != data['prediction']]

mis.to_csv('Misclassification/V10-EmbOTF-model1.csv', index = False, header = True)

# ---- Print out the prediction distribution
(unique, counts) = np.unique(pred, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print(frequencies)

# ---- Evaluation metrics
# Class 0 (dark pattern) is the positive class, i.e.
#   precision = cm[0][0] / (cm[0][0] + cm[1][0])
#   recall    = cm[0][0] / (cm[0][0] + cm[0][1])
# The sklearn scorers compute exactly these ratios but return 0.0 with a
# warning (instead of NaN) when a denominator is zero.
cm = metrics.confusion_matrix(data['Classification'], pred_list, labels=[0,1])
precision = metrics.precision_score(data['Classification'], pred_list, pos_label=0)
recall = metrics.recall_score(data['Classification'], pred_list, pos_label=0)
f1 = metrics.f1_score(data['Classification'], pred_list, pos_label=0)

print('Confusion Matrix of the prediction results:\n', cm)
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

Model Summary: 

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 20, 20)            100000    
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 19, 64)            2624      
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 64)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 32)                2080      
_________________________________________________________________
dropout_5 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 33        
Total params: 104,737
Trainable params: 104,737
Non-trainable params: 0
_______________________________



-----
## Embedding on the Fly --- Model 2

In [10]:
# Evaluate "Embedding on the Fly" CNN model 2 on the test set: load the saved
# tokenizer and model, predict a class per row, export the predicted
# dark-pattern rows and the misclassified rows, then report metrics.

# ---- Load the vectorizer
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a trusted source.
cv = joblib.load("V10/EmbOTF_HO/Presence_Tokenizer.joblib")

# ---- Load the model
clf = keras.models.load_model('V10/EmbOTF_HO/CNN_model2.h5')

print('Model Summary: \n')
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would emit a stray "None" after the table).
clf.summary()

# ---- Text Vectorization
# Vectorization: each text becomes a sequence of token ids.
content = cv.texts_to_sequences(data['content'])
# Padding: sequences are post-padded to `maxlen` ids.
maxlen = 20
pad_content = pad_sequences(content, padding='post', maxlen=maxlen)

# ---- Make Prediction
# Sequential.predict_classes() was removed in TensorFlow 2.6. For a model with
# a single output unit it thresholded predict() at 0.5, which is reproduced
# here explicitly (same approach as the pre-trained embedding cells).
prediction = clf.predict(pad_content)
pred = (prediction > 0.5).astype('int32')
pred_list = pred.flatten()

# Overwrites any 'prediction' column left by a previously run model cell.
data['prediction'] = pred_list.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction']==0]

dark.to_csv('DP/V10-EmbOTF-model2.csv', index = False, header = True)

# ----misclassification dataframe
mis = data.loc[data['Classification'] != data['prediction']]

mis.to_csv('Misclassification/V10-EmbOTF-model2.csv', index = False, header = True)

# ---- Print out the prediction distribution
(unique, counts) = np.unique(pred, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print(frequencies)

# ---- Evaluation metrics
# Class 0 (dark pattern) is the positive class, i.e.
#   precision = cm[0][0] / (cm[0][0] + cm[1][0])
#   recall    = cm[0][0] / (cm[0][0] + cm[0][1])
# The sklearn scorers compute exactly these ratios but return 0.0 with a
# warning (instead of NaN) when a denominator is zero.
cm = metrics.confusion_matrix(data['Classification'], pred_list, labels=[0,1])
precision = metrics.precision_score(data['Classification'], pred_list, pos_label=0)
recall = metrics.recall_score(data['Classification'], pred_list, pos_label=0)
f1 = metrics.f1_score(data['Classification'], pred_list, pos_label=0)

print('Confusion Matrix of the prediction results:\n', cm)
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

Model Summary: 

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 20, 20)            100000    
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 19, 32)            1312      
_________________________________________________________________
global_max_pooling1d_6 (Glob (None, 32)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 32)                1056      
_________________________________________________________________
dropout_6 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 33        
Total params: 102,401
Trainable params: 102,401
Non-trainable params: 0
_______________________________



-----
## Embedding on the Fly --- Model 3

In [11]:
# Evaluate "Embedding on the Fly" CNN model 3 on the test set: load the saved
# tokenizer and model, predict a class per row, export the predicted
# dark-pattern rows and the misclassified rows, then report metrics.

# ---- Load the vectorizer
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a trusted source.
cv = joblib.load("V10/EmbOTF_HO/Presence_Tokenizer.joblib")

# ---- Load the model
clf = keras.models.load_model('V10/EmbOTF_HO/CNN_model3.h5')

print('Model Summary: \n')
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would emit a stray "None" after the table).
clf.summary()

# ---- Text Vectorization
# Vectorization: each text becomes a sequence of token ids.
content = cv.texts_to_sequences(data['content'])
# Padding: sequences are post-padded to `maxlen` ids.
maxlen = 20
pad_content = pad_sequences(content, padding='post', maxlen=maxlen)

# ---- Make Prediction
# Sequential.predict_classes() was removed in TensorFlow 2.6. For a model with
# a single output unit it thresholded predict() at 0.5, which is reproduced
# here explicitly (same approach as the pre-trained embedding cells).
prediction = clf.predict(pad_content)
pred = (prediction > 0.5).astype('int32')
pred_list = pred.flatten()

# Overwrites any 'prediction' column left by a previously run model cell.
data['prediction'] = pred_list.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction']==0]

dark.to_csv('DP/V10-EmbOTF-model3.csv', index = False, header = True)

# ----misclassification dataframe
mis = data.loc[data['Classification'] != data['prediction']]

mis.to_csv('Misclassification/V10-EmbOTF-model3.csv', index = False, header = True)

# ---- Print out the prediction distribution
(unique, counts) = np.unique(pred, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print(frequencies)

# ---- Evaluation metrics
# Class 0 (dark pattern) is the positive class, i.e.
#   precision = cm[0][0] / (cm[0][0] + cm[1][0])
#   recall    = cm[0][0] / (cm[0][0] + cm[0][1])
# The sklearn scorers compute exactly these ratios but return 0.0 with a
# warning (instead of NaN) when a denominator is zero.
cm = metrics.confusion_matrix(data['Classification'], pred_list, labels=[0,1])
precision = metrics.precision_score(data['Classification'], pred_list, pos_label=0)
recall = metrics.recall_score(data['Classification'], pred_list, pos_label=0)
f1 = metrics.f1_score(data['Classification'], pred_list, pos_label=0)

print('Confusion Matrix of the prediction results:\n', cm)
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

Model Summary: 

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 20, 20)            100000    
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 20, 32)            672       
_________________________________________________________________
global_max_pooling1d_7 (Glob (None, 32)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 32)                1056      
_________________________________________________________________
dropout_7 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 33        
Total params: 101,761
Trainable params: 101,761
Non-trainable params: 0
_______________________________



-----
## Pre-trained Embedding --- Cased

In [20]:
# Evaluate the pre-trained-embedding (cased) CNN on the test set: load the
# saved tokenizer and model, predict a class per row, export the predicted
# dark-pattern rows and the misclassified rows, then report metrics.

# ---- Load the vectorizer
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a trusted source.
cv = joblib.load("V10/PreEmb_HO/Cased/Presence_Tokenizer.joblib")

# ---- Load the model
clf = keras.models.load_model('V10/PreEmb_HO/Cased/model_preEmb_CNN1.h5')

print('Model Summary: \n')
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would emit a stray "None" after the table).
clf.summary()

# ---- Text Vectorization
# Vectorization: each text becomes a sequence of token ids (case preserved).
content = cv.texts_to_sequences(data['content'])
# Padding: sequences are post-padded to `maxlen` ids.
maxlen = 20
pad_content = pad_sequences(content, padding='post', maxlen=maxlen)

# ---- Make Prediction
# Threshold the model output at 0.5 to obtain a 0/1 class per row.
prediction = clf.predict(pad_content)
pred = (prediction>0.5).astype('int32')
pred_list = pred.flatten()

# Overwrites any 'prediction' column left by a previously run model cell.
data['prediction'] = pred_list.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction']==0]

dark.to_csv('DP/V10-PreEmb-Cased.csv', index = False, header = True)

# ----misclassification dataframe
mis = data.loc[data['Classification'] != data['prediction']]

mis.to_csv('Misclassification/V10-PreEmb-Cased.csv', index = False, header = True)

# ---- Print out the prediction distribution
(unique, counts) = np.unique(pred, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print(frequencies)

# ---- Evaluation metrics
# Class 0 (dark pattern) is the positive class, i.e.
#   precision = cm[0][0] / (cm[0][0] + cm[1][0])
#   recall    = cm[0][0] / (cm[0][0] + cm[0][1])
# The sklearn scorers compute exactly these ratios but return 0.0 with a
# warning (instead of NaN) when a denominator is zero.
cm = metrics.confusion_matrix(data['Classification'], pred_list, labels=[0,1])
precision = metrics.precision_score(data['Classification'], pred_list, pos_label=0)
recall = metrics.recall_score(data['Classification'], pred_list, pos_label=0)
f1 = metrics.f1_score(data['Classification'], pred_list, pos_label=0)

print('Confusion Matrix of the prediction results:\n', cm)
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

Model Summary: 

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_2 (Embedding)      (None, None, 300)         1706100   
_________________________________________________________________
conv1d_5 (Conv1D)            (None, None, 64)          38464     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, None, 64)          0         
_________________________________________________________________
dense_10 (Dense)             (None, None, 32)          2080      
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 32)                0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 32)   

-----
## Pre-trained Embedding --- Uncased

In [21]:
# Evaluate the pre-trained-embedding (uncased) CNN on the test set: load the
# saved tokenizer and model, predict a class per row, export the predicted
# dark-pattern rows and the misclassified rows, then report metrics.

# ---- Load the vectorizer
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a trusted source.
cv = joblib.load("V10/PreEmb_HO/Uncased/Presence_Tokenizer.joblib")

# ---- Load the model
clf = keras.models.load_model('V10/PreEmb_HO/Uncased/model_preEmb_CNN1.h5')

print('Model Summary: \n')
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would emit a stray "None" after the table).
clf.summary()

# ---- Text Vectorization
# Vectorization: the text is lower-cased before tokenizing for the uncased model.
content = cv.texts_to_sequences(data['content'].str.lower())
# Padding: sequences are post-padded to `maxlen` ids.
maxlen = 20
pad_content = pad_sequences(content, padding='post', maxlen=maxlen)

# ---- Make Prediction
# Threshold the model output at 0.5 to obtain a 0/1 class per row.
prediction = clf.predict(pad_content)
pred = (prediction>0.5).astype('int32')
pred_list = pred.flatten()

# Overwrites any 'prediction' column left by a previously run model cell.
data['prediction'] = pred_list.tolist()

# ----dark pattern content are those where the predicted result equals to 0.
dark = data.loc[data['prediction']==0]

dark.to_csv('DP/V10-PreEmb-Uncased.csv', index = False, header = True)

# ----misclassification dataframe
mis = data.loc[data['Classification'] != data['prediction']]

mis.to_csv('Misclassification/V10-PreEmb-Uncased.csv', index = False, header = True)

# ---- Print out the prediction distribution
(unique, counts) = np.unique(pred, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print(frequencies)

# ---- Evaluation metrics
# Class 0 (dark pattern) is the positive class, i.e.
#   precision = cm[0][0] / (cm[0][0] + cm[1][0])
#   recall    = cm[0][0] / (cm[0][0] + cm[0][1])
# The sklearn scorers compute exactly these ratios but return 0.0 with a
# warning (instead of NaN) when a denominator is zero.
cm = metrics.confusion_matrix(data['Classification'], pred_list, labels=[0,1])
precision = metrics.precision_score(data['Classification'], pred_list, pos_label=0)
recall = metrics.recall_score(data['Classification'], pred_list, pos_label=0)
f1 = metrics.f1_score(data['Classification'], pred_list, pos_label=0)

print('Confusion Matrix of the prediction results:\n', cm)
print("\nPrecison: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))
print("F1 Score: {0:.3f}".format(f1))

Model Summary: 

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 300)         1706100   
_________________________________________________________________
conv1d (Conv1D)              (None, None, 64)          38464     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, None, 64)          0         
_________________________________________________________________
dense (Dense)                (None, None, 32)          2080      
_________________________________________________________________
global_max_pooling1d (Global (None, 32)                0         
_________________________________________________________________
dropout (Dropout)            (None, 32)     

-----
# DP Check

In [45]:
# Spot-check a single phrase against the uncased pre-trained-embedding model.

# ---- Load the vectorizer
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a trusted source.
cv = joblib.load("V10/PreEmb_HO/Uncased/Presence_Tokenizer.joblib")

# ---- Load the model
clf = keras.models.load_model('V10/PreEmb_HO/Uncased/model_preEmb_CNN1.h5')

print('Model Summary: \n')
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would emit a stray "None" after the table).
clf.summary()

# ---- Text Vectorization
# Vectorization: lower-case the phrase first, as the uncased evaluation cell
# does, so that mixed-case phrases are tokenized the same way.
phrase = 'ends soon'
content = cv.texts_to_sequences(pd.Series([phrase]).str.lower())
# Padding: sequences are post-padded to `maxlen` ids.
maxlen = 20
pad_content = pad_sequences(content, padding='post', maxlen=maxlen)

# ---- Make Prediction
prediction = clf.predict(pad_content)
# NOTE: the comparison is inverted relative to the evaluation cells (`<` rather
# than `>`), so a 1 here means the model output is below 0.5, i.e. the phrase
# falls in class 0, which the evaluation cells treat as a dark pattern.
pred = (prediction<0.5).astype('int32')
pred_list = pred.flatten()

pred_list

Model Summary: 

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 300)         1706100   
_________________________________________________________________
conv1d (Conv1D)              (None, None, 64)          38464     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, None, 64)          0         
_________________________________________________________________
dense (Dense)                (None, None, 32)          2080      
_________________________________________________________________
global_max_pooling1d (Global (None, 32)                0         
_________________________________________________________________
dropout (Dropout)            (None, 32)     

array([1], dtype=int32)