## DETECTING SPAM MAILS USING KERAS AND GLOVE6B WORD EMBEDDINGS

In [45]:
import os
import pickle
import time

import keras_metrics # for recall and precision metrics
import numpy as np
import tqdm
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.layers import Embedding, LSTM, Dropout, Dense
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
print('Modules imported')

Modules imported


In [2]:
# Notebook-wide configuration
SEQUENCE_LENGTH = 100  # number of tokens per sample once sequences are padded
EMBEDDING_SIZE = 100   # width of the GloVe vectors (the 100-dimensional set)
TEST_SIZE = 0.25       # share of the data held out for testing

# training schedule
BATCH_SIZE, EPOCHS = 64, 20

# label <-> integer lookups; the reverse table is derived so the two stay in sync
label2int = dict(ham=0, spam=1)
int2label = {index: label for label, index in label2int.items()}

#  Load dataset

In [3]:
def load_data(path="data/SMSSpamCollection", encoding="utf-8"):
    """
    Loads SMS Spam Collection dataset

    Each non-blank line is expected to hold a label followed by the message
    text, separated by whitespace. The first token becomes the label and the
    remaining tokens are re-joined with single spaces to form the text.

    Args:
        path: location of the dataset file (defaults to the path this
            notebook has always used).
        encoding: text encoding used to read the file. Given explicitly so the
            result does not depend on the platform's default encoding.

    Returns:
        (texts, labels): two parallel lists of strings.
    """
    texts, labels = [], []
    with open(path, encoding=encoding) as f:
        for line in f:
            split = line.split()
            if not split:
                # blank / whitespace-only line (e.g. a trailing newline at the
                # end of the file): nothing to parse, and split[0] would raise
                continue
            # tokens produced by str.split() carry no surrounding whitespace,
            # so no extra strip() is needed
            labels.append(split[0])
            texts.append(' '.join(split[1:]))
    return texts, labels

In [4]:
# X holds the raw message texts, y the matching "ham"/"spam" labels
X, y = load_data()
# preview the first three samples and their labels
print(X[:3])
print(y[:3])

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'Ok lar... Joking wif u oni...', "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]
['ham', 'ham', 'spam']


# Preparing the Dataset

Now we need a way to vectorize the text corpus by turning each text into a sequence of integers. You may be wondering why we need to do this: remember that we are going to feed the text into a neural network, and a neural network only understands numbers. More precisely, it needs a fixed-length sequence of integers.

But before we do all of that, we need to clean the corpus by removing punctuation, lowercasing all characters, etc. Luckily for us, Keras has a built-in class, keras.preprocessing.text.Tokenizer(), that does all of that in a few lines of code:

In [5]:
# Text tokenization
# vectorizing text, turning each text into sequence of integers
# (fitting populates tokenizer.word_index, which the embedding code reads later)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
# convert to sequence of integers
# NOTE: this rebinds X from raw strings to integer sequences, so re-running the
# cell without reloading the data would feed integers back into the tokenizer
X = tokenizer.texts_to_sequences(X)

In [9]:
# inspect the first message as a sequence of integer word ids
X[0]

[49,
 472,
 4436,
 843,
 756,
 659,
 64,
 8,
 1328,
 87,
 123,
 352,
 1329,
 148,
 2996,
 1330,
 67,
 58,
 4437,
 144]

In [10]:
# convert the labels to a numpy array
y = np.array(y)
# X is deliberately NOT passed through np.array() first: the sequences have
# different lengths, and building an array from ragged lists raises a
# ValueError on NumPy >= 1.24. pad_sequences accepts the list of lists directly
# and returns a 2-D numpy array.
#
# pad sequences at the beginning of each sequence with 0's
# for example if SEQUENCE_LENGTH=4:
# [[5, 3, 2], [5, 1, 2, 3], [3, 4]]
# will be transformed to:
# [[0, 5, 3, 2], [5, 1, 2, 3], [0, 0, 3, 4]]
X = pad_sequences(X, maxlen=SEQUENCE_LENGTH)

In [11]:
# inspect the first sample again, now left-padded with zeros to SEQUENCE_LENGTH
X[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,   49,  472, 4436,  843,  756,  659,   64,    8,
       1328,   87,  123,  352, 1329,  148, 2996, 1330,   67,   58, 4437,
        144])

As you may remember, we set SEQUENCE_LENGTH to 100, so all sequences now have a length of 100.

Our labels are text as well, but we are going to take a different approach here: since the only labels are "spam" and "ham", we one-hot encode them:

In [12]:
# One-hot encoding the labels
# [spam, ham, spam, ham, ham] is first mapped to integers via label2int:
# [1, 0, 1, 0, 0] and then to one-hot vectors:
# [[0, 1], [1, 0], [0, 1], [1, 0], [1, 0]]

y = [ label2int[label] for label in y ]
# num_classes is passed explicitly so the one-hot width always matches the
# number of known labels, even if one class happens to be absent from y
y = to_categorical(y, num_classes=len(label2int))

In [13]:
# inspect the first label as a one-hot vector (index 0 = ham, index 1 = spam)
y[0]

array([1., 0.], dtype=float32)

In [14]:
# Splitting the dataset into training and testing sets
# (TEST_SIZE is the held-out fraction; random_state is fixed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=7)

In [16]:
# report the dimensions of every split: features first, then labels
for split_array in (X_train, X_test, y_train, y_test):
    print(split_array.shape)

(4180, 100)
(1394, 100)
(4180, 2)
(1394, 2)


# Building the Model
<img src='data/img/network_diagram.png' height=500 width=500/> ![LSTM neural net](data/img/network_diagram.png)

The first layer is a pre-trained embedding layer that maps each word to an N-dimensional vector of real numbers (EMBEDDING_SIZE corresponds to the size of this vector, in this case 100). Two words that have similar meanings tend to have very close vectors.

In [17]:
def get_embedding_vectors(tokenizer, dim=100, glove_path=None):
    """
    Builds the embedding matrix for the tokenizer's vocabulary from a GloVe file.

    Row i of the returned matrix holds the GloVe vector of the word whose index
    in `tokenizer.word_index` is i. Row 0 and the rows of words that do not
    appear in the GloVe file are left as zeros.

    Args:
        tokenizer: object exposing a `word_index` mapping of word -> integer index.
        dim: dimensionality of the GloVe vectors to load.
        glove_path: optional path to the GloVe text file; defaults to
            "data/glove.6B.{dim}d.txt".

    Returns:
        numpy array of shape (len(word_index) + 1, dim).

    Raises:
        ValueError: if a vector for a vocabulary word does not have `dim` components.
    """
    if glove_path is None:
        glove_path = f"data/glove.6B.{dim}d.txt"

    word_index = tokenizer.word_index
    # words not found will be 0s
    embedding_matrix = np.zeros((len(word_index)+1, dim))

    with open(glove_path, encoding='utf8') as f:
        for line in tqdm.tqdm(f, "Reading GloVe"):
            values = line.split()
            if not values:
                # tolerate blank lines instead of failing on values[0]
                continue
            i = word_index.get(values[0])
            if i is None:
                # only vocabulary words are kept, so the full GloVe table is
                # never held in memory
                continue
            vector = np.asarray(values[1:], dtype='float32')
            if vector.shape != (dim,):
                raise ValueError(
                    f"Expected {dim}-dimensional vector for word {values[0]!r} "
                    f"in {glove_path}, got {vector.shape[0]} values"
                )
            embedding_matrix[i] = vector

    return embedding_matrix

To download the pre-trained glove.6B embeddings, use the link below and extract `glove.6B.100d.txt` into the `data/` directory:
http://nlp.stanford.edu/data/glove.6B.zip

# Define a model

In [19]:
def get_model(tokenizer, lstm_units):
    """
    Constructs the model,
    Embedding vectors => LSTM => 2 output Fully-Connected neurons with softmax activation

    Args:
        tokenizer: fitted tokenizer; `tokenizer.word_index` sizes the embedding layer.
        lstm_units: number of units in the LSTM layer.

    Returns:
        The compiled Sequential model. Its summary is printed as a side effect.
    """
    # get the GloVe embedding vectors; EMBEDDING_SIZE is passed explicitly so the
    # matrix width always matches the Embedding layer declared below
    embedding_matrix = get_embedding_vectors(tokenizer, dim=EMBEDDING_SIZE)
    model = Sequential()
    # frozen (trainable=False) embedding layer initialised with the GloVe weights
    model.add(Embedding(len(tokenizer.word_index)+1,
              EMBEDDING_SIZE,
              weights=[embedding_matrix],
              trainable=False,
              input_length=SEQUENCE_LENGTH))

    model.add(LSTM(lstm_units, recurrent_dropout=0.2))
    model.add(Dropout(0.3))
    model.add(Dense(2, activation="softmax"))
    # compile with the rmsprop optimizer and track accuracy,
    # precision and recall as metrics
    model.compile(optimizer="rmsprop", loss="categorical_crossentropy",
                  metrics=["accuracy", keras_metrics.precision(), keras_metrics.recall()])
    model.summary()
    return model

The above function constructs the whole model. We load the pre-trained embedding vectors into the Embedding layer and set trainable=False, which freezes the embedding weights during training.


<img src='data/img/definition.png' height=500 width=500/> ![LSTM neural net](data/img/definition.png)

In [20]:
# build and compile the network with a 128-unit LSTM (this also reads the GloVe file)
model = get_model(tokenizer=tokenizer, lstm_units=128)

Reading GloVe: 400000it [00:34, 11491.43it/s]


tracking <tf.Variable 'Variable:0' shape=() dtype=int32, numpy=0> tp
tracking <tf.Variable 'Variable:0' shape=() dtype=int32, numpy=0> fp
tracking <tf.Variable 'Variable:0' shape=() dtype=int32, numpy=0> tp
tracking <tf.Variable 'Variable:0' shape=() dtype=int32, numpy=0> fn
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          901300    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
Total params: 1,018,806
Trainable params: 117,506
Non-trainable params: 901,300
____________________________

In [22]:
# ModelCheckpoint writes into results/; create the directory up front so a
# fresh checkout does not fail when the first checkpoint is saved
os.makedirs("results", exist_ok=True)
# initialize our ModelCheckpoint and TensorBoard callbacks
# model checkpoint for saving best weights (only when val_loss improves)
model_checkpoint = ModelCheckpoint("results/spam_classifier_{val_loss:.2f}", save_best_only=True,
                                    verbose=1)
# for better visualization; the timestamp gives every run its own log directory
tensorboard = TensorBoard(f"logs/spam_classifier_{time.time()}")
# print our data shapes
print("X_train.shape:", X_train.shape)
print("X_test.shape:", X_test.shape)
print("y_train.shape:", y_train.shape)
print("y_test.shape:", y_test.shape)
# train the model
# NOTE: the test split doubles as validation data here, so the checkpoint
# selection is based on the same data that is later used for evaluation
model.fit(X_train, y_train, validation_data=(X_test, y_test),
          batch_size=BATCH_SIZE, epochs=EPOCHS,
          callbacks=[tensorboard, model_checkpoint],
          verbose=1)

X_train.shape: (4180, 100)
X_test.shape: (1394, 100)
y_train.shape: (4180, 2)
y_test.shape: (1394, 2)
Train on 4180 samples, validate on 1394 samples
Epoch 1/20
 128/4180 [..............................] - ETA: 1:08 - loss: 0.0881 - accuracy: 0.9531 - precision: 0.9492 - recall: 1.000 - ETA: 53s - loss: 0.0800 - accuracy: 0.9688 - precision: 0.9615 - recall: 0.9956 




Epoch 00001: val_loss improved from inf to 0.09662, saving model to results/spam_classifier_0.10
Epoch 2/20



Epoch 00002: val_loss improved from 0.09662 to 0.07071, saving model to results/spam_classifier_0.07
Epoch 3/20



Epoch 00003: val_loss did not improve from 0.07071
Epoch 4/20



Epoch 00004: val_loss did not improve from 0.07071
Epoch 5/20



Epoch 00005: val_loss improved from 0.07071 to 0.06440, saving model to results/spam_classifier_0.06
Epoch 6/20



Epoch 00006: val_loss improved from 0.06440 to 0.06249, saving model to results/spam_classifier_0.06
Epoch 7/20



Epoch 00007: val_loss did not improve from 0.06249
Epoch 8/20



Epoch 00008: val_loss improved from 0.06249 to 0.06081, saving model to results/spam_classifier_0.06
Epoch 9/20



Epoch 00009: val_loss improved from 0.06081 to 0.05901, saving model to results/spam_classifier_0.06
Epoch 10/20



Epoch 00010: val_loss did not improve from 0.05901
Epoch 11/20



Epoch 00011: val_loss did not improve from 0.05901
Epoch 12/20



Epoch 00012: val_loss did not improve from 0.05901
Epoch 13/20



Epoch 00013: val_loss did not improve from 0.05901
Epoch 14/20



Epoch 00014: val_loss did not improve from 0.05901
Epoch 15/20



Epoch 00015: val_loss did not improve from 0.05901
Epoch 16/20



Epoch 00016: val_loss did not improve from 0.05901
Epoch 17/20



Epoch 00017: val_loss did not improve from 0.05901
Epoch 18/20



Epoch 00018: val_loss did not improve from 0.05901
Epoch 19/20



Epoch 00019: val_loss did not improve from 0.05901
Epoch 20/20



Epoch 00020: val_loss did not improve from 0.05901


<keras.callbacks.callbacks.History at 0x2ab87d2cfc8>

In [37]:
# persist the trained model to disk; get_predictions() loads weights from this same file
model.save('spam_classifier_20_epoch.h5')

In [31]:
# score the model on the held-out split
result = model.evaluate(X_test, y_test)
# the values come back as loss first, then the metrics in the order
# they were given to compile(): accuracy, precision, recall
loss, accuracy, precision, recall = result[:4]

print(
    f"[+] Accuracy: {accuracy*100:.2f}%",
    f"[+] Precision:   {precision*100:.2f}%",
    f"[+] Recall:   {recall*100:.2f}%",
    sep="\n",
)

[+] Accuracy: 98.49%
[+] Precision:   98.94%
[+] Recall:   99.60%


In [50]:
def get_predictions(text):
    """
    Classifies a single message and returns its label ("ham" or "spam").

    The saved weights are reloaded from "spam_classifier_20_epoch.h5" on every
    call. The text is then tokenized with the notebook's fitted tokenizer,
    padded to SEQUENCE_LENGTH and passed through the model.
    """
    model.load_weights("spam_classifier_20_epoch.h5")
    # tokenize and pad in one step; the model expects a batch, hence the list
    padded = pad_sequences(tokenizer.texts_to_sequences([text]),
                           maxlen=SEQUENCE_LENGTH)
    # scores for the one sample in the batch, one entry per class
    class_scores = model.predict(padded)[0]
    # the position of the highest score is the predicted class id
    predicted_class = np.argmax(class_scores)
    return int2label[predicted_class]

In [51]:
# try the classifier on two hand-written messages and print each predicted label
for text in (
    "Congratulations! you have won 100,000$ this week, click here to claim fast",
    "Hi man, Meet you tommorow 3.00 pm",
):
    print(get_predictions(text))

spam
ham
