In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
!pip install h5py pyyaml



In [2]:
data = pd.read_csv('enriched_data.csv')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Pattern String  2938 non-null   object
 1   classification  2938 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 46.0+ KB


In [3]:
data.head(5)

Unnamed: 0,Pattern String,classification
0,FREE SHIPPING ON ORDERS OVER $100!,0
1,SOME EXCLUSIONS APPLY - LEARN MORE,1
2,HAVE A QUESTION? - CONTACT US,1
3,WELCOME TO 034MOTORSPORT!,1
4,SHOP AUDISHOP VOLKSWAGENPERFORMANCE SOFTWARE03...,1
5,SEARCH,1
6,HOME,1
7,/,1
8,"ADJUSTABLE SOLID REAR SWAY BAR, 8J/8P AUDI TT/...",1
9,MORE VIEWS,1


In [4]:
data['classification'].replace({0:'Dark',1:'Not_Dark'}, inplace = True)

data.head(5)

Unnamed: 0,Pattern String,classification
0,FREE SHIPPING ON ORDERS OVER $100!,Dark
1,SOME EXCLUSIONS APPLY - LEARN MORE,Not_Dark
2,HAVE A QUESTION? - CONTACT US,Not_Dark
3,WELCOME TO 034MOTORSPORT!,Not_Dark
4,SHOP AUDISHOP VOLKSWAGENPERFORMANCE SOFTWARE03...,Not_Dark
5,SEARCH,Not_Dark
6,HOME,Not_Dark
7,/,Not_Dark
8,"ADJUSTABLE SOLID REAR SWAY BAR, 8J/8P AUDI TT/...",Not_Dark
9,MORE VIEWS,Not_Dark


In [5]:
# Remove the rows where the first letter starting with ignoring characters
ignore_str = [',', '.', ';', '{', '}', '#', '/', '?', '@','$','(',')']
data = data[~data['Pattern String'].str[0].isin(ignore_str)]

# data['Pattern String'] = data['Pattern String'].str.lower()
data.shape

Unnamed: 0,Pattern String,classification
0,free shipping on orders over $100!,Dark
1,some exclusions apply - learn more,Not_Dark
2,have a question? - contact us,Not_Dark
3,welcome to 034motorsport!,Not_Dark
4,shop audishop volkswagenperformance software03...,Not_Dark
5,search,Not_Dark
6,home,Not_Dark
8,"adjustable solid rear sway bar, 8j/8p audi tt/...",Not_Dark
9,more views,Not_Dark
10,$268.00,Not_Dark


In [7]:
X = data['Pattern String'].values
Y = data['classification'].values

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y_encoded = encoder.fit_transform(Y)


(unique, counts) = np.unique(Y, return_counts=True)
frequencies_y_label = np.asarray((unique, counts)).T

print('The frequency distribution of y labels:\n',frequencies_y_label)


(unique, counts) = np.unique(y_encoded, return_counts=True)
frequencies_y_encoded = np.asarray((unique, counts)).T

print('The frequency distribution of y encoded labels:\n',frequencies_y_encoded)

The frequency distribution of y labels:
 [['Dark' 1481]
 ['Not_Dark' 1454]]
The frequency distribution of y encoded labels:
 [[   0 1481]
 [   1 1454]]


In [8]:
X.shape, Y.shape, y_encoded.shape

((2935,), (2935,), (2935,))

In [9]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer = Tokenizer(num_words=5000)   # num_words is the size of the vocabulary (top 5000 frequent words in the vocabulary)
tokenizer.fit_on_texts(X)    # Updates internal vocabulary based on a list of texts

X = tokenizer.texts_to_sequences(X)  # Transforms each text in texts to a sequence of integers with its corresponding integer value from the word_index dictionary

# The number of unique words in the whole training text
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index for padding

In [19]:
# Save the Tokenizer to disk
import joblib

joblib.dump(tokenizer, 'Presence_CV_Tokenizer.joblib')

['Presence_CV_Tokenizer.joblib']

In [10]:
def FindMaxLength(lst):
    maxList = max((x) for x in lst)
    maxLength = max(len(x) for x in lst )
  
    return maxList, maxLength

print(FindMaxLength(X))

([4132, 4, 255, 161, 25, 221, 91, 5, 259, 283, 762], 114)


In [11]:
from keras.preprocessing.sequence import pad_sequences

maxlen = 20

X = pad_sequences(X, padding='post', maxlen=maxlen)

X.shape

(2935, 50)

In [12]:
from keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Embedding, Conv1D, GlobalMaxPooling1D

embedding_dim = 20
vocab_size = 5000
num_filters = 128
kernel_size = 2

def create_model(optimizer = 'adam'):
    model01 = Sequential()
    model01.add(Embedding(input_dim=vocab_size,          # input_dim is the size of the vocabulary
                               output_dim=embedding_dim,      # output_dim is the size of the dense vector
                               input_length=maxlen))          # input_length is the length of the sequence
    model01.add(Conv1D(num_filters,               # add a Convolution1D, which will learn filters
                     kernel_size,       # convolutional windows length, which mean will operate over 3 words every time
                     padding='valid',
                     activation='relu',
                     strides=1))
    model01.add(GlobalMaxPooling1D())    # use max pooling to reduce the size of the feature maps
    model01.add(Dense(200, activation='relu'))
    model01.add(Dropout(0.2))
    model01.add(Dense(1, activation='sigmoid'))
    model01.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model01

In [17]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from keras.callbacks import EarlyStopping
tf.config.run_functions_eagerly(True)

classifier = KerasClassifier(build_fn=create_model,epochs=5, batch_size=5,verbose=1)

seed = 2
# evaluate using 10-fold cross validation
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

results = cross_val_score(classifier, X, Y, cv=kfold)

print(results)
print(results.mean())



0.87802126010259


In [18]:
results

array([0.88559753, 0.88445807, 0.86400819])

### Parameter Tunning
-----

In [21]:
from keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Embedding, Conv1D, GlobalMaxPooling1D

embedding_dim = 20
vocab_size = 5000
num_filters = 128
kernel_size = 2

def tune_model(optimizer = 'adam'):
    model02 = Sequential()
    model02.add(Embedding(input_dim=vocab_size,          # input_dim is the size of the vocabulary
                               output_dim=embedding_dim,      # output_dim is the size of the dense vector
                               input_length=maxlen))          # input_length is the length of the sequence
    model02.add(Conv1D(num_filters,               # add a Convolution1D, which will learn filters
                     kernel_size,       # convolutional windows length, which mean will operate over 3 words every time
                     padding='valid',
                     activation='relu',
                     strides=1))
    model02.add(GlobalMaxPooling1D())    # use max pooling to reduce the size of the feature maps
    model02.add(Dense(200, activation='relu'))
    model02.add(Dropout(0.2))
    model02.add(Dense(1, activation='sigmoid'))
    model02.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model02


In [23]:
# create model
model = KerasClassifier(build_fn=tune_model, verbose=0)

# grid search epochs, batch size and optimizer
optimizers = ['rmsprop', 'adam']
epochs = [5, 10, 15]
batches = [5, 10, 20]
param_grid = dict(optimizer=optimizers, epochs=epochs, batch_size=batches)

# default is using 3-fold stratified cross validation

grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result = grid.fit(X, Y)

KeyboardInterrupt: 

In [None]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Save the best model to a HDF5 file

grid_result.save('best_CNN_CV_model.h5')