In [18]:
import tensorflow as tf
from tensorflow.keras import models, layers
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [19]:
# Load the messages from the CSV file into a DataFrame and preview the first rows.
data_df = pd.read_csv("Spam_data.csv")
data_df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [20]:
# Shuffle every row (frac=1) with a fixed seed so the order is reproducible.
# The result is a new frame; `data_df` itself is left untouched.
data_shuffled = data_df.sample(frac=1, random_state=42)
data_shuffled.head()

Unnamed: 0,Category,Message
3245,ham,Squeeeeeze!! This is christmas hug.. If u lik ...
944,ham,And also I've sorta blown him off a couple tim...
1044,ham,Mmm thats better now i got a roast down me! i...
2484,ham,Mm have some kanji dont eat anything heavy ok
812,ham,So there's a ring that comes with the guys cos...


In [21]:
# Encode the text categories as integers: spam -> 0, ham -> 1
# (so the "positive" class 1 is ham, not spam).
# `.map` is used instead of `.replace` because recent pandas versions deprecate
# the implicit object -> int downcast that `.replace` relied on here.
# The identity entries (0 -> 0, 1 -> 1) keep this cell safe to re-run, and the
# integer cast raises if any value is missing from the mapping (it maps to NaN).
data_shuffled['Category'] = (
    data_shuffled['Category']
    .map({'spam': 0, 'ham': 1, 0: 0, 1: 1})
    .astype('int64')
)

In [22]:
# Preview the frame after the Category column has been integer-encoded.
data_shuffled.head()

Unnamed: 0,Category,Message
3245,1,Squeeeeeze!! This is christmas hug.. If u lik ...
944,1,And also I've sorta blown him off a couple tim...
1044,1,Mmm thats better now i got a roast down me! i...
2484,1,Mm have some kanji dont eat anything heavy ok
812,1,So there's a ring that comes with the guys cos...


In [23]:
from sklearn.model_selection import train_test_split

# Inputs are the raw message strings; targets are the integer-encoded categories.
X = data_shuffled.loc[:, 'Message'].to_numpy()
y = data_shuffled.loc[:, 'Category'].to_numpy()

# Hold out 20% of the rows for validation; the seed keeps the split reproducible.
(train_sentences, val_sentences,
 train_labels, val_labels) = train_test_split(X, y, random_state=42, test_size=0.2)

In [24]:
# Peek at the first ten training messages.
train_sentences[:10]

array(['I am in tirupur da, once you started from office call me.',
       'Congratulations ur awarded 500 of CD vouchers or 125gift guaranteed & Free entry 2 100 wkly draw txt MUSIC to 87066 TnCs www.Ldew.com1win150ppmx3age16',
       'I will come tomorrow di',
       'Po de :-):):-):-):-). No need job aha.',
       'Text82228>> Get more ringtones, logos and games from www.txt82228.com. Questions: info@txt82228.co.uk',
       'Much better now thanks lol',
       'No she didnt. I will search online and let you know.',
       'Just chill for another 6hrs. If you could sleep the pain is not a surgical emergency so see how it unfolds. Okay',
       'He is a womdarfull actor',
       'SMS SERVICES. for your inclusive text credits, pls goto www.comuk.net login= 3qxj9 unsubscribe with STOP, no extra charge. help 08702840625.COMUK. 220-CM2 9AE'],
      dtype=object)

In [25]:
# Average number of whitespace-separated tokens per training message, rounded
# to the nearest integer.
round(
    sum(len(sentence.split()) for sentence in train_sentences)
    / len(train_sentences)
)

16

In [26]:
# Text -> integer-id layer: at most 10000 vocabulary entries, and every message
# is padded or truncated to 16 token ids.
text_vectorizer = layers.TextVectorization(output_sequence_length=16, max_tokens=10000)

In [27]:
# Fit the vectorizer on the training sentences only (validation text is not used).
text_vectorizer.adapt(train_sentences)

In [28]:
# Sanity check: vectorize one hand-written sentence and inspect the resulting ids.
sample_sentence = "There's a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 16), dtype=int64, numpy=
array([[ 389,    5, 6751,    9,   12, 1086,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0]], dtype=int64)>

In [29]:
# Inspect the learned vocabulary: its size plus the entries at the two ends of the list.
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words, bottom_5_words = words_in_vocab[:5], words_in_vocab[-5:]

print(
    f"Number of words in vocab: {len(words_in_vocab)}",
    f"5 most common words: {top_5_words}",
    f"5 least common words: {bottom_5_words}",
    sep="\n",
)

Number of words in vocab: 8479
5 most common words: ['', '[UNK]', 'i', 'to', 'you']
5 least common words: ['0125698789', '01223585334', '01223585236', '0121', '0089my']


In [30]:
# Embedding layer sized to match the vectorizer settings used above:
# 10000 possible token ids, 128-dimensional vectors, 16 tokens per message.
embedding = layers.Embedding(10000, 128, input_length=16)

embedding

<keras.layers.embeddings.Embedding at 0x167368f21f0>

In [32]:
import random

# Pick one training sentence at random (unseeded, so it differs between runs).
random_sentence = random.choice(train_sentences)

# Keep the message on a single logical line: a backslash continuation *inside*
# the string literal would embed the next line's indentation in the printed text.
print(f"Original text:\n {random_sentence}\n\nEmbedded version:")

# Vectorize the sentence to token ids, then look up the embedding of every id.
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
 Come by our room at some point so we can iron out the plan for this weekend        

Embedded version:


<tf.Tensor: shape=(1, 16, 128), dtype=float32, numpy=
array([[[-0.038249  , -0.01289154,  0.0306404 , ...,  0.01196621,
          0.00238883, -0.04631406],
        [-0.04649078,  0.00078639,  0.01028613, ...,  0.00056112,
          0.01427468, -0.00211804],
        [-0.02996655,  0.03114503, -0.02434107, ..., -0.02245229,
          0.02245172,  0.00476415],
        ...,
        [-0.01683529,  0.01376239, -0.02553462, ...,  0.03013388,
         -0.03608333,  0.01400602],
        [ 0.04613322,  0.04465889, -0.04678743, ...,  0.02316752,
         -0.0068737 ,  0.04350287],
        [-0.03750529,  0.02363577,  0.00883666, ..., -0.02447285,
          0.02656169,  0.03388977]]], dtype=float32)>

In [33]:
# Show the embedding vector of the first token, its shape, and `random_sentence[0]`.
# NOTE(review): indexing a string with [0] yields its first character, not its
# first word — confirm this is what was meant to be displayed.
sample_embed[0][0], sample_embed[0][0].shape, random_sentence[0]

(<tf.Tensor: shape=(128,), dtype=float32, numpy=
 array([-0.038249  , -0.01289154,  0.0306404 ,  0.01019611, -0.03077215,
         0.01924035,  0.0320631 ,  0.00991832,  0.01614109, -0.0460308 ,
        -0.00696546, -0.00579023, -0.01372055, -0.04923457, -0.00745581,
        -0.03160735, -0.02289295, -0.0240159 ,  0.04268349,  0.00999887,
         0.04384083, -0.04907345,  0.03463724, -0.0308336 ,  0.03123367,
         0.01625849,  0.0182979 , -0.01804769,  0.02508886,  0.04253796,
         0.00119212,  0.01594105, -0.0113885 , -0.0034619 , -0.04898969,
        -0.03013184, -0.04993705,  0.02361066, -0.03868522,  0.00672096,
         0.00861593,  0.04208613, -0.0254522 ,  0.02628628, -0.04480546,
         0.03554613,  0.0179359 , -0.01637862,  0.02282419,  0.04934869,
         0.01987066, -0.04797655, -0.00958093, -0.02175086,  0.00679664,
        -0.02502545,  0.03427105, -0.00089613,  0.00583258,  0.00697871,
        -0.02007295,  0.03627903, -0.03563932,  0.01312179, -0.03623333,
  

In [41]:
#----Models----
#0 - Naive Bayes (TF-IDF baseline)
#1 - Simple dense model (averaged embeddings)
#2 - LSTM
#3 - GRU
#4 - Bidirectional LSTM
#5 - 1D CNN
#6 - Pretrained feature extractor (Universal Sentence Encoder)
#7 - Model 6 with 10% of data

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline: TF-IDF features fed into a multinomial Naive Bayes classifier.
model_0 = Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

# Fit both pipeline steps on the training split.
model_0.fit(train_sentences, train_labels)

In [36]:
# Score the baseline pipeline on the validation split and report it as a percentage.
baseline_score = model_0.score(val_sentences, val_labels)
print(f"Our baseline model achieves an accuracy of: {baseline_score*100:.2f}%")

Our baseline model achieves an accuracy of: 95.43%


In [37]:
# Predicted labels for the validation messages; show the first twenty.
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:20]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
      dtype=int64)

In [38]:
# Display the training labels (integer-encoded categories).
train_labels

array([1, 0, 1, ..., 0, 1, 1], dtype=int64)

In [39]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
    """Summarise classification quality for a set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1-score'.
        Note the mixed scales: accuracy is multiplied by 100 (a percentage),
        while precision, recall and F1 are the 'weighted' averages returned
        by scikit-learn as-is.
    """
    # The fourth value returned by scikit-learn (support) is not used.
    precision, recall, f1, _unused_support = precision_recall_fscore_support(
        y_true, y_pred, average='weighted'
    )
    return {
        'accuracy': accuracy_score(y_true, y_pred) * 100,
        'precision': precision,
        'recall': recall,
        'f1-score': f1,
    }

In [40]:
# Metric dictionary for the Naive Bayes baseline on the validation split.
baseline_results = calculate_results(val_labels, baseline_preds)

baseline_results

{'accuracy': 95.42600896860986,
 'precision': 0.9565835313454701,
 'recall': 0.9542600896860987,
 'f1-score': 0.9505836782903585}

In [45]:
#Model-1: averaged token embeddings followed by a single sigmoid unit.

# Give this model its own Embedding layer. Reusing one notebook-level Embedding
# instance across several models makes them share (and keep training) the same
# weight matrix, so the models would not be trained independently.
model_1_embedding = layers.Embedding(
    input_dim=10000,    # matches the vectorizer's max_tokens
    output_dim=128,
    input_length=16     # matches the vectorizer's output_sequence_length
)

inputs = layers.Input(shape=(1,), dtype=tf.string)  # one raw string per example
x = text_vectorizer(inputs)                         # string -> token ids
x = model_1_embedding(x)                            # token ids -> vectors
x = layers.GlobalAveragePooling1D()(x)              # average over the token axis
outputs = layers.Dense(1, activation='sigmoid')(x)  # single probability output
model_1 = tf.keras.Model(inputs, outputs, name='model_1_dense')

In [46]:
# Print the layer-by-layer structure and parameter counts of model_1.
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 1)]               0         
_________________________________________________________________
text_vectorization (TextVect (None, 16)                0         
_________________________________________________________________
embedding (Embedding)        (None, 16, 128)           1280000   
_________________________________________________________________
global_average_pooling1d_1 ( (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 1,280,129
Trainable params: 1,280,129
Non-trainable params: 0
_________________________________________________________________


In [47]:
# Binary classification setup: cross-entropy loss, Adam with default settings,
# accuracy tracked as the metric.
model_1.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for five epochs, evaluating on the validation split after each epoch.
model_1_history = model_1.fit(
    x=train_sentences,
    y=train_labels,
    validation_data=(val_sentences, val_labels),
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [48]:
# Evaluate on the validation split; the result lists the loss followed by the accuracy metric.
model_1.evaluate(val_sentences, val_labels)



[0.0787719264626503, 0.9766815900802612]

In [50]:
# Sigmoid outputs (one probability per validation message).
model_1_pred_probs = model_1.predict(val_sentences)
model_1_pred_probs

array([[0.7731793 ],
       [0.9108775 ],
       [0.99399054],
       ...,
       [0.99491394],
       [0.96786815],
       [0.9864151 ]], dtype=float32)

In [51]:
# Round each probability to 0. or 1. and drop the size-1 axis to get a flat label vector.
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs))
model_1_preds

<tf.Tensor: shape=(1115,), dtype=float32, numpy=array([1., 1., 1., ..., 1., 1., 1.], dtype=float32)>

In [53]:
# Metric dictionary for model_1 on the validation split.
model_1_results = calculate_results(val_labels, model_1_preds)

model_1_results

{'accuracy': 97.66816143497758,
 'precision': 0.9770764231646449,
 'recall': 0.9766816143497757,
 'f1-score': 0.9758938637050117}

In [54]:
#Model-2: token embeddings fed through a single LSTM layer.

# Give this model its own Embedding layer. Reusing one notebook-level Embedding
# instance across several models makes them share (and keep training) the same
# weight matrix, so this model would start from weights another model already
# trained and would modify that model's weights in turn.
model_2_embedding = layers.Embedding(
    input_dim=10000,    # matches the vectorizer's max_tokens
    output_dim=128,
    input_length=16     # matches the vectorizer's output_sequence_length
)

inputs = layers.Input(shape=(1,), dtype=tf.string)  # one raw string per example
x = text_vectorizer(inputs)                         # string -> token ids
x = model_2_embedding(x)                            # token ids -> vectors
x = layers.LSTM(64)(x)                              # 64 units, returns the final output only
outputs = layers.Dense(1, activation='sigmoid')(x)  # single probability output
model_2 = tf.keras.Model(inputs, outputs, name='model_2_LSTM')

In [55]:
# Print the layer-by-layer structure and parameter counts of model_2.
model_2.summary()

Model: "model_2_LSTM"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 1)]               0         
_________________________________________________________________
text_vectorization (TextVect (None, 16)                0         
_________________________________________________________________
embedding (Embedding)        (None, 16, 128)           1280000   
_________________________________________________________________
lstm (LSTM)                  (None, 64)                49408     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 1,329,473
Trainable params: 1,329,473
Non-trainable params: 0
_________________________________________________________________


In [56]:
# Same training configuration as the other Keras models in this notebook:
# binary cross-entropy, default Adam, accuracy metric.
model_2.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for five epochs, evaluating on the validation split after each epoch.
model_2_history = model_2.fit(
    x=train_sentences,
    y=train_labels,
    validation_data=(val_sentences, val_labels),
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [57]:
# Evaluate on the validation split; the result lists the loss followed by the accuracy metric.
model_2.evaluate(val_sentences, val_labels)



[0.07291274517774582, 0.9775784611701965]

In [59]:
# Sigmoid outputs (one probability per validation message).
model_2_pred_probs = model_2.predict(val_sentences)
model_2_pred_probs

array([[0.9974929 ],
       [0.99866354],
       [0.99979967],
       ...,
       [0.9998292 ],
       [0.99962914],
       [0.9997578 ]], dtype=float32)

In [60]:
# Round each probability to 0. or 1. and drop the size-1 axis to get a flat label vector.
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
model_2_preds

<tf.Tensor: shape=(1115,), dtype=float32, numpy=array([1., 1., 1., ..., 1., 1., 1.], dtype=float32)>

In [61]:
# Metric dictionary for model_2 on the validation split.
model_2_results = calculate_results(val_labels, model_2_preds)
model_2_results

{'accuracy': 97.75784753363229,
 'precision': 0.9774317362193562,
 'recall': 0.9775784753363229,
 'f1-score': 0.9774913196077624}

In [63]:
import tensorflow_hub as hub
# Wrap the Universal Sentence Encoder (v4) from TF Hub as a Keras layer.
# input_shape=[] with dtype=tf.string means each example is a single scalar string;
# trainable=False keeps the pretrained weights frozen during training.
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        input_shape=[],
                                        dtype=tf.string,
                                        trainable=False,
                                        name='USE')

In [64]:
# Frozen sentence encoder followed by a small trainable classification head.
model_6 = tf.keras.Sequential(name='model_6_USE')
model_6.add(sentence_encoder_layer)                  # pretrained feature extractor
model_6.add(layers.Dense(64, activation='relu'))     # hidden layer on top of the features
model_6.add(layers.Dense(1, activation='sigmoid'))   # single probability output

# Same training configuration as the other Keras models in this notebook.
model_6.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [65]:
# Print the layer-by-layer structure and parameter counts of model_6.
model_6.summary()

Model: "model_6_USE"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
USE (KerasLayer)             (None, 512)               256797824 
_________________________________________________________________
dense_3 (Dense)              (None, 64)                32832     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 256,830,721
Trainable params: 32,897
Non-trainable params: 256,797,824
_________________________________________________________________


In [66]:
# Train for five epochs, evaluating on the validation split after each epoch.
model_6_history = model_6.fit(
    x=train_sentences,
    y=train_labels,
    validation_data=(val_sentences, val_labels),
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [67]:
# Evaluate on the validation split; the result lists the loss followed by the accuracy metric.
model_6.evaluate(val_sentences, val_labels)



[0.045263320207595825, 0.9874439239501953]

In [68]:
# Sigmoid outputs (one probability per validation message).
model_6_pred_probs = model_6.predict(val_sentences)
model_6_pred_probs

array([[0.66795266],
       [0.9752442 ],
       [0.9985291 ],
       ...,
       [0.99840254],
       [0.99925   ],
       [0.99877614]], dtype=float32)

In [69]:
# Round each probability to 0. or 1. and drop the size-1 axis to get a flat label vector.
model_6_preds = tf.squeeze(tf.round(model_6_pred_probs))
model_6_preds

<tf.Tensor: shape=(1115,), dtype=float32, numpy=array([1., 1., 1., ..., 1., 1., 1.], dtype=float32)>

In [70]:
# Metric dictionary for model_6 on the validation split.
model_6_results = calculate_results(val_labels, model_6_preds)
model_6_results

{'accuracy': 98.7443946188341,
 'precision': 0.9873863023534284,
 'recall': 0.9874439461883409,
 'f1-score': 0.9873112492451572}

In [72]:
# Collect every model's metric dictionary into one table (one column per model),
# then transpose for display so that each model becomes a row.
results = pd.DataFrame(dict(
    baseline_results=baseline_results,
    model_1_results=model_1_results,
    model_2_results=model_2_results,
    model_6_results=model_6_results,
))

results.T

Unnamed: 0,accuracy,precision,recall,f1-score
baseline_results,95.426009,0.956584,0.95426,0.950584
model_1_results,97.668161,0.977076,0.976682,0.975894
model_2_results,97.757848,0.977432,0.977578,0.977491
model_6_results,98.744395,0.987386,0.987444,0.987311
