In [1]:
# Import necessary libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd


In [2]:
data = pd.read_pickle('preprocessed_data2.pkl')

In [3]:
textual_columns = ['sender', 'receiver', 'subject', 'body']
data['combined_text'] = data[textual_columns].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

data = data[['combined_text', 'label']] 

X = data['combined_text'].values
y = data['label'].values

In [23]:
# Tokenizing and padding
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=100)

In [24]:
# pickle tokenizer
import pickle
with open('tokenizer_cnn.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [5]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

In [6]:
# Model building
model = Sequential()
model.add(Embedding(5000, 50, input_length=100))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='sigmoid'))

In [7]:
# Compiling the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [8]:
# Model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 50)           250000    
                                                                 
 conv1d (Conv1D)             (None, 96, 128)           32128     
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 282,257
Trainable params: 282,257
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Training the model
model.fit(X_train, y_train, batch_size=32, epochs=5, validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x169819b7160>

In [10]:
# Evaluating the model
loss, accuracy = model.evaluate(X_test, y_test)



In [11]:
predictions = model.predict(X_test)
predictions = [1 if x > 0.5 else 0 for x in predictions]



In [12]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score, cohen_kappa_score, matthews_corrcoef, log_loss, brier_score_loss
roc_auc = roc_auc_score(y_test, predictions)
print('ROC AUC=%.3f' % (roc_auc))

kappa = cohen_kappa_score(y_test,predictions)
print('Cohens kappa: %f' % kappa)

mcc = matthews_corrcoef(y_test, predictions)
print('MCC: %f' % mcc)

log_loss = log_loss(y_test, predictions)
print('Log Loss: %f' % log_loss)

brier_score = brier_score_loss(y_test, predictions)
print('Brier Score: %f' % brier_score)

ROC AUC=0.985
Cohens kappa: 0.969534
MCC: 0.969537
Log Loss: 0.548364
Brier Score: 0.015214


In [13]:
from sklearn.metrics import classification_report
my_label_data=["Ham","Spam"]
print(classification_report(y_test,predictions,target_names=my_label_data))

              precision    recall  f1-score   support

         Ham       0.99      0.98      0.99     18256
        Spam       0.98      0.99      0.98     16975

    accuracy                           0.98     35231
   macro avg       0.98      0.98      0.98     35231
weighted avg       0.98      0.98      0.98     35231



In [26]:
model.save('cnn_model_weights.h5')

# save custom model
model.save('cnn_model.h5', save_format='h5')

In [27]:
loaded_model = tf.keras.models.load_model('cnn_model_weights.h5')

In [28]:
model.predict(tokenizer.texts_to_sequences(['You have won a lottery of $1000. Please contact us to claim your prize.']))



array([[0.9993507]], dtype=float32)

In [29]:
loaded_tokenizer = pickle.load(open('tokenizer_cnn.pkl', 'rb'))

loaded_model.predict(loaded_tokenizer.texts_to_sequences(['You have won a lottery of $1000. Please contact us to claim your prize.']))

ValueError: in user code:

    File "c:\Users\tarik\miniconda3\lib\site-packages\keras\engine\training.py", line 2041, in predict_function  *
        return step_function(self, iterator)
    File "c:\Users\tarik\miniconda3\lib\site-packages\keras\engine\training.py", line 2027, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\tarik\miniconda3\lib\site-packages\keras\engine\training.py", line 2015, in run_step  **
        outputs = model.predict_step(data)
    File "c:\Users\tarik\miniconda3\lib\site-packages\keras\engine\training.py", line 1983, in predict_step
        return self(x, training=False)
    File "c:\Users\tarik\miniconda3\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\tarik\miniconda3\lib\site-packages\keras\engine\input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential" is incompatible with the layer: expected shape=(None, 100), found shape=(None, 14)
