## Loading Spam Data

In [2]:
import pandas as pd
import os


# Read the labelled SMS corpus from the CSV file and preview its first rows
spam_data = pd.read_csv("Spam-Classification.csv")

print("\nLoaded Data :\n------------------------------------")
print(spam_data.head())

# Pull out the target column (CLASS) and the raw message text column (SMS)
spam_classes_txt, spam_messages = spam_data["CLASS"], spam_data["SMS"]


Loaded Data :
------------------------------------
  CLASS                                                SMS
0   ham   said kiss, kiss, i can't do the sound effects...
1   ham      &lt;#&gt; ISH MINUTES WAS 5 MINUTES AGO. WTF.
2  spam  (Bank of Granite issues Strong-Buy) EXPLOSIVE ...
3  spam  * FREE* POLYPHONIC RINGTONE Text SUPER to 8713...
4  spam  **FREE MESSAGE**Thanks for using the Auction S...


##  Preprocessing Spam Data

In [8]:
#Build a label encoder for target variable to convert strings to numeric values.
from sklearn import preprocessing
import tensorflow as tf

# Fit the encoder on the text labels, then map every label to its integer id.
# The fitted encoder is kept so predictions can be mapped back to label text later.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(spam_classes_txt)
spam_classes_binary = label_encoder.transform(spam_classes_txt)

# Expand the integer ids into one-hot rows with one column per class (2 classes)
spam_classes_one_hot = tf.keras.utils.to_categorical(
    spam_classes_binary,
    num_classes=2,
)

print("One-hot Encoding Shape : ", spam_classes_one_hot.shape)

One-hot Encoding Shape :  (1500, 2)


In [9]:
# Display the integer-encoded labels produced by label_encoder.fit_transform
spam_classes_binary

array([0, 0, 1, ..., 0, 0, 1])

In [10]:
# Display the one-hot version of the labels (one row per message, one column per class)
spam_classes_one_hot

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [0., 1.]], dtype=float32)

In [11]:
#Preprocess data for spam messages
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenisation settings
VOCAB_WORDS = 10000          # upper bound on vocabulary size passed to the tokenizer
MAX_SEQUENCE_LENGTH = 100    # every message is padded/truncated to this many token IDs

# Learn the word -> integer-ID mapping from the message corpus
spam_tokenizer = Tokenizer(num_words=VOCAB_WORDS)
spam_tokenizer.fit_on_texts(spam_messages)

print("Total unique tokens found: ", len(spam_tokenizer.word_index))
print("Example token ID for word \"kiss\" :", spam_tokenizer.word_index.get("kiss"))

# Replace each message by its list of token IDs ...
spam_sequences = spam_tokenizer.texts_to_sequences(spam_messages)

# ... and bring all of them to the same fixed length
spam_padded = pad_sequences(
    spam_sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
)

print("\nTotal sequences found : ", len(spam_padded))
print("Example Sequence for sentence : ", spam_messages[0] )
print(spam_padded[0])

Total unique tokens found:  4688
Example token ID for word "kiss" : 921

Total sequences found :  1500
Example Sequence for sentence :   said kiss, kiss, i can't do the sound effects! He is a gorgeous man isn't he! Kind of person who needs a smile to brighten his day! 
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0  260  921  921    4  430   55    6 1488 2294  148   10
    3 1489  464 1143  148  922   19  514   77 1144    3  515    1 2295
  397   89]


## Split into training and test data

In [14]:

from sklearn.model_selection import train_test_split
# Hold out 20% of the messages for testing.
# A fixed random_state makes the train/test partition identical on every run,
# so results can be compared across notebook re-executions.
# NOTE: model weight initialisation and training are seeded separately, so
# metrics may still vary slightly between runs.
SPLIT_RANDOM_STATE = 42

X_train, X_test, Y_train, Y_test = train_test_split(
    spam_padded,
    spam_classes_one_hot,
    test_size=0.2,
    random_state=SPLIT_RANDOM_STATE,
)


## Building the embedding matrix

In [20]:
#Load the pre-trained embeddings

import numpy as np

# Read the pre-trained GloVe embeddings into a dictionary: token -> float32 vector

# Number of features per embedding vector in the file loaded below
EMBEDDING_DIM = 50

glove_dict = {}

# Loading the 50-dimensional vectors from the GloVe "6B" release.
# Each line of the file is: <token> <value_1> ... <value_50>, separated by whitespace.
with open('glove.6B.50d.txt', "r", encoding="utf8") as glove_file:
    for line in glove_file:
        emb_line = line.split()

        # Skip blank lines and lines that do not hold exactly one token plus
        # EMBEDDING_DIM values. Checking the field count first avoids an
        # IndexError on empty lines and avoids converting rows that would be
        # discarded anyway.
        if len(emb_line) != EMBEDDING_DIM + 1:
            continue

        emb_token = emb_line[0]
        glove_dict[emb_token] = np.asarray(emb_line[1:], dtype=np.float32)

print("Dictionary Size: ", len(glove_dict))

Dictionary Size:  400000


## Test the entry for the word "sky"

In [21]:
# Sanity check: look up one word in the loaded dictionary (.get returns None if it is absent)
print("\n Sample Dictionary Entry for word \"sky\" :\n", glove_dict.get("sky"))


 Sample Dictionary Entry for word "sky" :
 [ 0.081092   0.94466    0.33658    0.42124    0.27977   -0.73385
 -0.97879   -0.52544    0.13249   -0.2126     0.41312    0.19676
  0.12114    0.87748   -0.16792    0.79765   -0.18026    0.23597
 -1.9492    -0.84402    0.15311    1.0843     0.52439   -0.28308
  0.17648   -0.37219   -0.68172    1.4701     0.48146    0.028964
  1.9263     0.55726    0.092331  -0.2266    -0.41086   -0.23616
 -0.12419   -1.0425    -0.22734   -0.58257    0.58536    0.20313
 -0.2065    -0.41059   -0.39159    0.12677    0.10595   -0.52283
 -0.0062389 -0.56913  ]


In [23]:
# Associate each token ID in our data set vocabulary with the corresponding
# embedding in GloVe. If the word is not available in GloVe, its row stays all zeros.

# Matrix with 1 row for each word in the data set vocabulary and 50 features.
# The +1 accounts for row 0: the tokenizer's word IDs start at 1, and 0 is the
# value used for padding in the padded sequences.
vocab_len = len(spam_tokenizer.word_index) + 1

embedding_matrix = np.zeros((vocab_len, 50))

# No try/except here: dict.get never raises, and a shape mismatch on assignment
# would indicate a corrupt embedding file and should surface rather than be
# silently swallowed. The loop variable is named token_id to avoid shadowing
# the builtin id() for the rest of the notebook.
for word, token_id in spam_tokenizer.word_index.items():
    embedding_vector = glove_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[token_id] = embedding_vector

print("Size of Embedding matrix :", embedding_matrix.shape)
print("Embedding Vector for word \"hi\" : \n", embedding_matrix[spam_tokenizer.word_index.get("hi")])

Size of Embedding matrix : (4689, 50)
Embedding Vector for word "hi" : 
 [-0.54312998  0.34426999  0.27125001  1.04869998 -1.16419995 -1.27219999
  0.35780999 -0.56527001 -0.29879001  0.85179001  0.52222002 -0.0019718
 -0.46434999  0.033631    0.048367    0.78762001  0.075995    0.51577002
  0.34777999  0.53802001  0.28299001 -0.1313     -0.073753    0.42614001
  0.030954   -0.55032998 -0.99789    -0.28946999  0.30517    -1.11940002
  1.29569995  0.91165     0.32222     0.93405002 -0.34152001 -0.62712997
 -0.092165    0.50901002  0.29203999 -0.20122001  0.19614001 -0.45881999
  1.1099     -0.68737     1.57239997 -0.10446     0.23593999 -0.56594002
  0.43676001  0.98092997]


In [24]:
# Display the tokenizer's word -> token-ID mapping.
# NOTE(review): this shows the whole vocabulary; consider displaying only a slice.
spam_tokenizer.word_index

{'to': 1,
 'you': 2,
 'a': 3,
 'i': 4,
 'call': 5,
 'the': 6,
 'your': 7,
 'u': 8,
 'for': 9,
 'is': 10,
 'and': 11,
 'now': 12,
 'free': 13,
 'or': 14,
 '2': 15,
 'have': 16,
 'in': 17,
 'on': 18,
 'of': 19,
 'txt': 20,
 'ur': 21,
 '4': 22,
 'with': 23,
 'are': 24,
 'me': 25,
 'from': 26,
 'my': 27,
 'text': 28,
 'just': 29,
 'get': 30,
 'stop': 31,
 'this': 32,
 'mobile': 33,
 'reply': 34,
 'that': 35,
 'claim': 36,
 'no': 37,
 'be': 38,
 'so': 39,
 'it': 40,
 'only': 41,
 'out': 42,
 'our': 43,
 'www': 44,
 'will': 45,
 'prize': 46,
 'we': 47,
 'send': 48,
 'not': 49,
 'new': 50,
 'if': 51,
 'can': 52,
 'cash': 53,
 'at': 54,
 'do': 55,
 'but': 56,
 'won': 57,
 '1': 58,
 'all': 59,
 '150p': 60,
 'week': 61,
 "i'm": 62,
 't': 63,
 'msg': 64,
 'nokia': 65,
 'go': 66,
 'uk': 67,
 'win': 68,
 'please': 69,
 'as': 70,
 'been': 71,
 'know': 72,
 'urgent': 73,
 'tone': 74,
 'like': 75,
 'when': 76,
 'who': 77,
 'up': 78,
 'contact': 79,
 'com': 80,
 'want': 81,
 'by': 82,
 'service': 83,
 

## Build the Spam Model with Embeddings

In [25]:
#Create a model
from tensorflow import keras
from tensorflow.keras import optimizers
from tensorflow.keras.regularizers import l2
from keras.layers import LSTM,Dense

# Setup hyper-parameters for building the model
NUM_CLASSES=2

model = tf.keras.models.Sequential()

# Embedding layer: one 50-feature row per vocabulary ID, initialised from the
# GloVe-based embedding_matrix. trainable=True lets training adjust the vectors.
model.add(keras.layers.Embedding(vocab_len,
                                 50,
                                 name="Embedding-Layer",
                                 weights=[embedding_matrix],
                                 input_length=MAX_SEQUENCE_LENGTH,
                                 trainable=True))

# Add LSTM layer.
# Use the LSTM from the same `tensorflow.keras` namespace as every other layer
# in this model; mixing in layers from the standalone `keras` package can fail
# when the two installed versions differ.
model.add(keras.layers.LSTM(256))

# The LSTM already emits a single 256-wide vector per sample, so Flatten does
# not change the shape here (see the model summary); kept for parity with the
# recorded architecture.
model.add(keras.layers.Flatten())

# Softmax over the classes, matching the one-hot encoded targets
model.add(keras.layers.Dense(NUM_CLASSES,
                             name='Output-Layer',
                             activation='softmax'))

# The optimizer is spelled out explicitly; "rmsprop" is what compile() uses
# when none is given, so behaviour is unchanged.
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Embedding-Layer (Embedding)  (None, 100, 50)          234450    
                                                                 
 lstm (LSTM)                 (None, 256)               314368    
                                                                 
 flatten (Flatten)           (None, 256)               0         
                                                                 
 Output-Layer (Dense)        (None, 2)                 514       
                                                                 
Total params: 549,332
Trainable params: 549,332
Non-trainable params: 0
_________________________________________________________________


## Train and evaluate the model

In [26]:
# Training hyper-parameters
VERBOSE = 1              # print per-epoch progress while fitting
BATCH_SIZE = 256
EPOCHS = 10
VALIDATION_SPLIT = 0.2   # fraction of the training data held back for validation

print("\nTraining Progress:\n------------------------------------")

# Fit on the training split; the returned History object is kept for later inspection
history = model.fit(
    x=X_train,
    y=Y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=VERBOSE,
    validation_split=VALIDATION_SPLIT,
)

print("\nEvaluation against Test Dataset :\n------------------------------------")
# Last expression of the cell: its return value is shown as the cell output
model.evaluate(X_test, Y_test)




Training Progress:
------------------------------------
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Evaluation against Test Dataset :
------------------------------------


[0.12025237083435059, 0.9599999785423279]

## Predicting Spam

In [34]:
# Sample messages to classify
input_str = [
    "enter the free auction link to get a free 2000 dollors gift",
    "Do not call me again",
]

# Apply the same preprocessing as for training: tokenizer IDs, then fixed-length padding
input_seq = spam_tokenizer.texts_to_sequences(input_str)
input_padded = pad_sequences(input_seq, maxlen=MAX_SEQUENCE_LENGTH)

# The model outputs one score per class; pick the index of the highest score per message
prediction = model.predict(input_padded).argmax(axis=1)
print("Prediction Output:" , prediction)

# Translate the class indices back to their original text labels
print("Prediction Classes are ", label_encoder.inverse_transform(prediction))

Prediction Output: [1 0]
Prediction Classes are  ['spam' 'ham']
