In [12]:
import tensorflow as tf
# Ask TensorFlow which physical GPU devices it can see; the resulting
# collection is empty (falsy) when there are none.
gpus = tf.config.experimental.list_physical_devices('GPU')

In [13]:
# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
# The setting has to be the same on every visible GPU, so apply it to each
# device rather than only the first one; with no GPUs the loop is a no-op.
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, enable=True)

In [14]:
__author__ = "Siddharth Achari Sharabu"
!pip install ipynb



In [15]:
import ipynb
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint, TensorBoard
from sklearn.model_selection import train_test_split
import time
import numpy as np
import pickle
%run Utilities.ipynb import get_model, SEQUENCE_LENGTH, TEST_SIZE
%run Utilities.ipynb import BATCH_SIZE, EPOCHS, label2int

In [16]:
def load_data(path="data/spam.txt", encoding=None):
    """Read a labelled text corpus stored with one example per line.

    Every non-blank line is split on whitespace: the first token is the label
    and the remaining tokens, re-joined with single spaces, form the text.
    Blank or whitespace-only lines are skipped.

    Args:
        path: File to read. Defaults to "data/spam.txt".
        encoding: Text encoding handed to open(); None keeps the platform
            default.

    Returns:
        A (texts, labels) tuple of two lists of str aligned by index.
    """
    texts, labels = [], []
    with open(path, encoding=encoding) as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Nothing on this line, so there is no label to read.
                continue
            label, *words = tokens
            labels.append(label)
            texts.append(' '.join(words))
    return texts, labels

In [17]:
# X receives the list of message texts and y the matching list of string labels.
X, y = load_data()

In [18]:
# Fit the tokenizer on the raw messages and persist it so the same fitted
# object can be reloaded later.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
# The context manager closes the file handle even if pickling fails.
with open("results/tokenizer.pickle", "wb") as f:
    pickle.dump(tokenizer, f)

# Encode every message as a variable-length list of integers.
X = tokenizer.texts_to_sequences(X)
print(X[0])
# The sequences are ragged, so they are deliberately NOT wrapped in np.array():
# that triggers a ragged-array warning on older NumPy and a ValueError on newer
# releases. pad_sequences accepts the list of lists directly.
y = np.array(y)
# Bring every sequence to exactly SEQUENCE_LENGTH entries.
X = pad_sequences(X, maxlen=SEQUENCE_LENGTH)
print(X[0])

[49, 472, 4436, 843, 756, 659, 64, 8, 1328, 87, 123, 352, 1329, 148, 2996, 1330, 67, 58, 4437, 144]


  X = np.array(X)


[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0   49  472 4436  843
  756  659   64    8 1328   87  123  352 1329  148 2996 1330   67   58
 4437  144]


In [19]:
# Translate each string label to its integer id through label2int, then turn
# the ids into one-hot rows.
label_ids = [label2int[name] for name in y]
y = to_categorical(label_ids)
print(y[0])

[1. 0.]


In [20]:
# Hold out a TEST_SIZE share of the samples for testing; the fixed seed keeps
# the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=7,
)

In [21]:
# Report the dimensions of each split as a quick sanity check.
for caption, split in (
    ("X_train.shape:", X_train),
    ("X_test.shape:", X_test),
    ("y_train.shape:", y_train),
    ("y_test.shape:", y_test),
):
    print(caption, split.shape)

X_train.shape: (4180, 100)
X_test.shape: (1394, 100)
y_train.shape: (4180, 2)
y_test.shape: (1394, 2)


In [22]:
# Build the network through the get_model helper.
model = get_model(tokenizer=tokenizer, lstm_units=128)

# Keep a checkpoint only when val_loss improves; the loss value (2 decimals)
# is embedded in the file name.
checkpoint_path = "results/spam_classifier_{val_loss:.2f}.h5"
model_checkpoint = ModelCheckpoint(checkpoint_path, save_best_only=True, verbose=1)

# A timestamped log directory keeps successive TensorBoard runs apart.
log_dir = f"logs/spam_classifier_{time.time()}"
tensorboard = TensorBoard(log_dir)

Reading GloVe: 400000it [00:22, 17660.34it/s]


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          901300    
                                                                 
 lstm (LSTM)                 (None, 128)               117248    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 2)                 258       
                                                                 
Total params: 1,018,806
Trainable params: 117,506
Non-trainable params: 901,300
_________________________________________________________________


In [23]:
# Train the model and keep the returned History object so the per-epoch
# metrics can be inspected or plotted afterwards (this also stops the bare
# History repr from being echoed as cell output).
# NOTE(review): the test split is also passed as validation data, and the
# checkpoint callback selects weights by val_loss, so any score later reported
# on X_test/y_test is optimistic. Consider carving a separate validation split
# out of the training data.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[tensorboard, model_checkpoint],
    verbose=1,
)

Epoch 1/20
Epoch 1: val_loss improved from inf to 0.27557, saving model to results\spam_classifier_0.28.h5
Epoch 2/20
Epoch 2: val_loss improved from 0.27557 to 0.07908, saving model to results\spam_classifier_0.08.h5
Epoch 3/20
Epoch 3: val_loss did not improve from 0.07908
Epoch 4/20
Epoch 4: val_loss improved from 0.07908 to 0.06171, saving model to results\spam_classifier_0.06.h5
Epoch 5/20
Epoch 5: val_loss did not improve from 0.06171
Epoch 6/20
Epoch 6: val_loss improved from 0.06171 to 0.05976, saving model to results\spam_classifier_0.06.h5
Epoch 7/20
Epoch 7: val_loss did not improve from 0.05976
Epoch 8/20
Epoch 8: val_loss did not improve from 0.05976
Epoch 9/20
Epoch 9: val_loss improved from 0.05976 to 0.05311, saving model to results\spam_classifier_0.05.h5
Epoch 10/20
Epoch 10: val_loss did not improve from 0.05311
Epoch 11/20
Epoch 11: val_loss did not improve from 0.05311
Epoch 12/20
Epoch 12: val_loss did not improve from 0.05311
Epoch 13/20
Epoch 13: val_loss did no

<keras.callbacks.History at 0x190dc630e50>

In [25]:
# Score the in-memory model on the held-out split. `result` is indexed
# positionally further down: loss first, then the metrics.
# NOTE(review): this presumably evaluates the weights from the last epoch, not
# the best checkpoint saved during training - confirm whether that is intended.
result = model.evaluate(X_test, y_test)



In [27]:
# The first four entries of the evaluation result, in positional order.
loss, accuracy, precision, recall = result[:4]

In [28]:
# Show the headline metrics as percentages, one per line.
# NOTE(review): all three values come out identical in the recorded run;
# presumably precision and recall are computed over both one-hot columns rather
# than for the spam class alone - verify against the metrics set in get_model.
print(
    f"[+] Accuracy: {accuracy*100:.2f}%",
    f"[+] Precision:   {precision*100:.2f}%",
    f"[+] Recall:   {recall*100:.2f}%",
    sep="\n",
)

[+] Accuracy: 98.49%
[+] Precision:   98.49%
[+] Recall:   98.49%
