# Spam Ditection model

* tensorflow model
* scikit learn model

In [2]:
import pandas as pd
import numpy as np 

### Data is collected from the spam.csv

In [122]:
import pandas as pd

# Try different encodings until you find the correct one
encodings_to_try = ['utf-8', 'ISO-8859-1', 'latin1', 'cp1252']

for encoding in encodings_to_try:
    try:
        data = pd.read_csv("spam.csv", encoding=encoding)
        print("Successfully read using encoding:", encoding)
        break
    except UnicodeDecodeError:
        print("Encoding", encoding, "didn't work")
data

Encoding utf-8 didn't work
Successfully read using encoding: ISO-8859-1


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


Null columns are removed

In [5]:
nullcolumn = data.columns[data.isnull().any()]
data = data.drop(columns=nullcolumn)
data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


The spam and ham is changed to the 1 and 0

In [7]:
replace_dict = {'spam' : 1, 'ham' : 0}
data['v1'] = data['v1'].replace(replace_dict)
data

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


#### using train test split the data is plited for the testing and the training purposes.

In [10]:
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(data["v1"],data["v2"],test_size = .30 , random_state = 42)

In [36]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)


(3900,)
(3900,)
(1672,)
(1672,)


In [12]:
x_train[:10], y_train[:10]

(708     1
 4338    0
 5029    0
 4921    0
 2592    0
 2275    0
 1424    0
 1216    1
 5211    0
 4743    0
 Name: v1, dtype: int64,
 708     To review and KEEP the fantastic Nokia N-Gage ...
 4338                   Just got outta class gonna go gym.
 5029    Is there coming friday is leave for pongal?do ...
 4921    Hi Dear Call me its urgnt. I don't know whats ...
 2592    My friend just got here and says he's upping h...
 2275           Is that on the telly? No its Brdget Jones!
 1424                    Yes.. now only saw your message..
 1216    You have 1 new voicemail. Please call 08719181...
 5211                      It is only yesterday true true.
 4743                               \Thinking of u ;) x\""
 Name: v2, dtype: object)

### Text vectorization using the tensorflow keras layer

In [13]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization # after TensorFlow 2.6

# Before TensorFlow 2.6
# from tensorflow.keras.layers.experimental.preprocessing import TextVectorization 
# Note: in TensorFlow 2.6+, you no longer need "layers.experimental.preprocessing"
# you can use: "tf.keras.layers.TextVectorization", see https://github.com/tensorflow/tensorflow/releases/tag/v2.6.0 for more

# Use the default TextVectorization variables
text_vectorizer = TextVectorization(max_tokens=None, # how many words in the vocabulary (all of the different words in your text)
                                    standardize="lower_and_strip_punctuation", # how to process text
                                    split="whitespace", # how to split tokens
                                    ngrams=None, # create groups of n-words?
                                    output_mode="int", # how to map tokens to numbers
                                    output_sequence_length=None) # how long should the output sequence of tokens be?
                                    # pad_to_max_tokens=True) # Not valid if using max_tokens=None

In [14]:
#average no of tokens
round(sum([len(i.split()) for i in y_train])/len(y_train))

15

In [24]:
max_vocab_length = 10000
max_length = 15

# Create the TextVectorization layer
text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",  # or "binary", "tf-idf", etc., depending on your needs
    output_sequence_length=max_length,
)

text_vectorizer adapted to the model

In [26]:
# Fit the text vectorizer to the training text
text_vectorizer.adapt(y_train)

In [28]:
word_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = word_in_vocab[:5]
bottom_5_words = word_in_vocab[-5:]
word_in_vocab , top_5_words , bottom_5_words

(['',
  '[UNK]',
  'to',
  'i',
  'you',
  'a',
  'the',
  'u',
  'and',
  'in',
  'is',
  'me',
  'my',
  'for',
  'your',
  'of',
  'it',
  'have',
  'call',
  'on',
  'are',
  'that',
  'now',
  '2',
  'im',
  'so',
  'not',
  'but',
  'at',
  'or',
  'ur',
  'get',
  'will',
  'just',
  'do',
  'can',
  'be',
  'if',
  'with',
  'we',
  'this',
  'no',
  'its',
  'up',
  'free',
  'go',
  'when',
  'ltgt',
  '4',
  'ok',
  'from',
  'what',
  'dont',
  'all',
  'how',
  'know',
  'out',
  'then',
  'like',
  'am',
  'got',
  'ill',
  'come',
  'was',
  'only',
  'good',
  'time',
  'love',
  'text',
  'send',
  'there',
  'day',
  'want',
  'going',
  'txt',
  'home',
  'by',
  'still',
  'he',
  'need',
  'lor',
  'as',
  'one',
  'sorry',
  'see',
  'about',
  'r',
  'stop',
  'reply',
  'mobile',
  'da',
  'back',
  'our',
  'hi',
  'n',
  'today',
  'well',
  'please',
  'new',
  'think',
  'Ì',
  'cant',
  'an',
  'she',
  'tell',
  'phone',
  'later',
  'any',
  'her',
  'bee

### Embedding layer from the tensorflow

In [32]:
tf.random.set_seed(42)
from tensorflow.keras import layers

embedding = layers.Embedding(
    input_dim = max_vocab_length,
    output_dim = 128,
    embeddings_initializer ="uniform",
    input_length = max_length,
    name = "embedding")
embedding

<keras.src.layers.core.embedding.Embedding at 0x1c3fe647b90>

In [None]:
import random
random_sentence = random.choice(y_train)

print(f"radom sentence: {random_sentence}")

sample_embedded = embedding(text_vectorizer([random_sentence]))
sample_embedded

In [38]:
y_train[:2]

708     To review and KEEP the fantastic Nokia N-Gage ...
4338                   Just got outta class gonna go gym.
Name: v2, dtype: object

## Sklearn model using the TfidfVectorizer and MultinomialNB in the pipeline

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create tokenization and modelling pipeline
model_0 = Pipeline([
                    ("tfidf", TfidfVectorizer()), # convert words to numbers using tfidf
                    ("clf", MultinomialNB()) # model the text
])

# Fit the pipeline to the training data
model_0.fit(y_train, x_train)

In [41]:
baseline_score = model_0.score(y_test, x_test)
print(f"Our baseline model achieves an accuracy of: {baseline_score*100:.2f}%")

Our baseline model achieves an accuracy of: 95.99%


In [42]:
baseline_preds = model_0.predict(y_test)
baseline_preds[:20]

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0],
      dtype=int64)

### Calculation of precision and recall and accuracy score

In [43]:
from sklearn.metrics import accuracy_score,precision_recall_fscore_support

def calculate_results(y_true , y_pred):
    model_accuracy = accuracy_score(y_true , y_pred) * 100

    model_precision , model_recall , model_f1 , _ = precision_recall_fscore_support(y_true , y_pred,average = "weighted")
    model_results = {
        "accuracy" : model_accuracy,
        "precision" : model_precision,
        "recall" : model_recall,
        "f1" : model_f1
    }
    return model_results

In [44]:
baseline_results = calculate_results(y_true = x_test , y_pred=baseline_preds)
baseline_results

{'accuracy': 95.99282296650718,
 'precision': 0.9616945511206245,
 'recall': 0.9599282296650717,
 'f1': 0.9567614211061143}

## Tensorflow model for the model prediction

In [46]:
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,),dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(1,activation="sigmoid")(x)
model_1 = tf.keras.Model(inputs , outputs ,name = "model_1_densse")

In [213]:
model_1.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [214]:
model_1.summary()

Model: "model_1_densse"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_2 (Text  (None, 15)                0         


 Vectorization)                                                  
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (  (None, 128)               0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1280129 (4.88 MB)
Trainable params: 1280129 (4.88 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [215]:
model_1_history = model_1.fit(
    y_train,
    x_train,
    epochs = 5,
    validation_data = (y_test , x_test)
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [53]:
model_1.evaluate(y_test , x_test)



[0.09269191324710846, 0.9724880456924438]

In [55]:
embed_weights = model_1.get_layer("embedding").get_weights()[0]
print(embed_weights.shape)

(10000, 128)


In [216]:
model_1_pred_probs = model_1.predict(y_test)
model_1_pred_probs[:10]



array([[6.7256123e-01],
       [1.3051700e-02],
       [9.7295398e-01],
       [5.2309531e-01],
       [9.9999255e-01],
       [3.0881870e-03],
       [2.0475173e-02],
       [1.3739296e-03],
       [3.5868652e-05],
       [8.3282609e-03]], dtype=float32)

In [217]:
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs))
model_1_preds[:20]

<tf.Tensor: shape=(20,), dtype=float32, numpy=
array([1., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       1., 0., 0.], dtype=float32)>

In [218]:
model_1_results = calculate_results(y_true=x_test,
                                    y_pred=model_1_preds)
model_1_results

{'accuracy': 97.84688995215312,
 'precision': 0.9783919782794124,
 'recall': 0.9784688995215312,
 'f1': 0.9778849854534022}

In [219]:
words_in_vocab = text_vectorizer.get_vocabulary()
len(words_in_vocab), words_in_vocab[:10]

(7829, ['', '[UNK]', 'to', 'i', 'you', 'a', 'the', 'u', 'and', 'in'])

In [220]:
embed_weights = model_1.get_layer("embedding").get_weights()[0]
print(embed_weights.shape)

(10000, 128)


### visualize the data in the 

In [221]:
# Code below is adapted from: https://www.tensorflow.org/tutorials/text/word_embeddings#retrieve_the_trained_word_embeddings_and_save_them_to_disk
import io

# Create output writers
out_v = io.open("embedding_vectors.tsv", "w", encoding="utf-8")
out_m = io.open("embedding_metadata.tsv", "w", encoding="utf-8")

# Write embedding vectors and words to file
for num, word in enumerate(words_in_vocab):
  if num == 0:
     continue # skip padding token
  vec = embed_weights[num]
  out_m.write(word + "\n") # write words to file
  out_v.write("\t".join([str(x) for x in vec]) + "\n") # write corresponding word vector to file
out_v.close()
out_m.close()

# # Download files locally to upload to Embedding Projector
# try:
#   from google.colab import files
# except ImportError:
#   pass
# else:
#   files.download("embedding_vectors.tsv")
#   files.download("embedding_metadata.tsv")

In [231]:
# Set random seed and create embedding layer (new embedding layer for each model)
tf.random.set_seed(42)
from tensorflow.keras import layers
model_2_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="embedding_2")

# Create LSTM model
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_2_embedding(x)
print(x.shape)
# x = layers.LSTM(64, return_sequences=True)(x) # return vector for each word in the Tweet (you can stack RNN cells as long as return_sequences=True)
x = layers.Bidirectional(layers.LSTM(64))(x) # return vector for whole sequence
print(x.shape)
# x = layers.GlobalAveragePooling1D()(x)
# x = layers.Dense(64, activation="relu")(x) # optional dense layer on top of output of LSTM cell
outputs = layers.Dense(1, activation="sigmoid")(x)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")

(None, 15, 128)
(None, 128)


In [238]:
# Compile model
model_2.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [239]:
# Fit model
model_2_history = model_2.fit(y_train,
                              x_train,
                              epochs=4,
                              validation_data=(y_test, x_test),
                              )

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [240]:
# Make predictions on the validation dataset
model_2_pred_probs = model_2.predict(y_test)
model_2_pred_probs.shape, model_2_pred_probs[:10] # view the first 10



((1672, 1),
 array([[6.7699458e-03],
        [6.5633375e-04],
        [9.9921429e-01],
        [7.1226083e-02],
        [9.9999845e-01],
        [1.8746797e-06],
        [6.2745953e-06],
        [2.1606957e-06],
        [2.4330791e-06],
        [6.1751971e-06]], dtype=float32))

In [241]:
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))

In [242]:
model_2_result = calculate_2_results = calculate_results(y_true = x_test,
                                        y_pred = model_2_preds)
model_2_result 

{'accuracy': 97.54784688995215,
 'precision': 0.9751450493645157,
 'recall': 0.9754784688995215,
 'f1': 0.9748952343685159}

In [237]:
# type(y_test[:0])
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
print(model_2_preds)

tf.Tensor([1. 0. 1. ... 0. 0. 0.], shape=(1672,), dtype=float32)


In [200]:
import pandas as pd

# Your input text
input_text = "how are you"

# Create a pandas Series from the input text
series = pd.Series([input_text])

model_sample_pred_probs = model_2.predict(series)
model_sample_preds = tf.squeeze(tf.round(model_sample_pred_probs))
# The variable model_2_pred_probs now contains the predicted probabilities
print(model_sample_preds)


tf.Tensor(0.0, shape=(), dtype=float32)


In [205]:
!pip install tensorflowjs

Defaulting to user installation because normal site-packages is not writeable
Collecting tensorflowjs
  Obtaining dependency information for tensorflowjs from https://files.pythonhosted.org/packages/78/77/f9a83027eca63ac777daf3c133a53f77c47139a25d236629d5634b0e2025/tensorflowjs-4.10.0-py3-none-any.whl.metadata
  Downloading tensorflowjs-4.10.0-py3-none-any.whl.metadata (3.1 kB)
Collecting flax<0.6.3,>=0.6.2 (from tensorflowjs)
  Using cached flax-0.6.2-py3-none-any.whl (189 kB)
Collecting importlib_resources>=5.9.0 (from tensorflowjs)
  Obtaining dependency information for importlib_resources>=5.9.0 from https://files.pythonhosted.org/packages/25/d4/592f53ce2f8dde8be5720851bd0ab71cc2e76c55978e4163ef1ab7e389bb/importlib_resources-6.0.1-py3-none-any.whl.metadata
  Downloading importlib_resources-6.0.1-py3-none-any.whl.metadata (4.0 kB)
Collecting jax>=0.3.16 (from tensorflowjs)
  Downloading jax-0.4.14.tar.gz (1.3 MB)
     ---------------------------------------- 0.0/1.3 MB ? eta -:--:--

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\win10\\AppData\\Roaming\\Python\\Python311\\site-packages\\~-mpy\\.libs\\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll'
Check the permissions.



In [212]:
import tensorflowjs as tfjs

  np.uint8, np.uint16, np.object, np.bool]


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\win10\AppData\Roaming\Python\Python311\site-packages\IPython\core\interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\win10\AppData\Local\Temp\ipykernel_8728\1219052119.py", line 1, in <module>
    import tensorflowjs as tfjs
  File "C:\Users\win10\AppData\Roaming\Python\Python311\site-packages\tensorflowjs\__init__.py", line 21, in <module>
    from tensorflowjs import converters
  File "C:\Users\win10\AppData\Roaming\Python\Python311\site-packages\tensorflowjs\converters\__init__.py", line 21, in <module>
    from tensorflowjs.converters.converter import convert
  File "C:\Users\win10\AppData\Roaming\Python\Python311\site-packages\tensorflowjs\converters\converter.py", line 35, in <module>
    from tensorflowjs.converters import keras_h5_conversion as conversion
  File "C:\Users\win10\AppData\Roaming\Python\Python311\site-packages\tensorflowjs\converters\keras_h5_conver

In [211]:
import os
tfjs.converters.save_keras_model(
        model_2,
        os.path.join('/models','spamModel/')
        )

NameError: name 'tfjs' is not defined

In [197]:
type(y_test[:1])

pandas.core.series.Series

## Convolution model

In [150]:
# Set random seed and create embedding layer (new embedding layer for each model)
tf.random.set_seed(42)
from tensorflow.keras import layers
model_5_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="embedding_5")

# Create 1-dimensional convolutional layer to model sequences
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_5_embedding(x)
x = layers.Conv1D(filters=32, kernel_size=5, activation="relu")(x)
x = layers.GlobalMaxPool1D()(x)
# x = layers.Dense(64, activation="relu")(x) # optional dense layer
outputs = layers.Dense(1, activation="sigmoid")(x)
model_5 = tf.keras.Model(inputs, outputs, name="model_5_Conv1D")

# Compile Conv1D model
model_5.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# Get a summary of our 1D convolution model
model_5.summary()

Model: "model_5_Conv1D"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_10 (InputLayer)       [(None, 1)]               0         
                                                                 
 text_vectorization_2 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding_5 (Embedding)     (None, 15, 128)           1280000   
                                                                 
 conv1d (Conv1D)             (None, 11, 32)            20512     
                                                                 
 global_max_pooling1d (Glob  (None, 32)                0         
 alMaxPooling1D)                                                 
                                                                 
 dense_10 (Dense)            (None, 1)              

In [151]:
# Fit the model
model_5_history = model_5.fit(y_train,
                              x_train,
                              epochs=5,
                              validation_data=(y_test, x_test)
                              )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [153]:
# Make predictions on the validation dataset
model_5_pred_probs = model_5.predict(y_test)
model_5_pred_probs.shape, model_5_pred_probs[:10] # view the first 10



((1672, 1),
 array([[9.4827724e-01],
        [4.6196266e-04],
        [9.6759254e-01],
        [6.4812154e-01],
        [9.9999654e-01],
        [1.6038724e-03],
        [2.8326563e-03],
        [2.5044777e-03],
        [2.9368882e-04],
        [2.8296788e-03]], dtype=float32))

In [157]:
model_5_preds = tf.squeeze(tf.round(model_5_pred_probs))
model_5_preds

<tf.Tensor: shape=(1672,), dtype=float32, numpy=array([1., 0., 1., ..., 0., 0., 0.], dtype=float32)>

In [158]:
model_5_results = calculate_results(y_true=x_test,
                                    y_pred = model_5_preds)
model_5_results

{'accuracy': 97.72727272727273,
 'precision': 0.97700601472003,
 'recall': 0.9772727272727273,
 'f1': 0.9767571482479271}

In [244]:
embed_weights2 = model_5.get_layer("embedding_5").get_weights()[0]
print(embed_weights.shape)

(10000, 128)


In [245]:
# Code below is adapted from: https://www.tensorflow.org/tutorials/text/word_embeddings#retrieve_the_trained_word_embeddings_and_save_them_to_disk
import io

# Create output writers
out_v = io.open("embedding_vectors5.tsv", "w", encoding="utf-8")
out_m = io.open("embedding_metadata5.tsv", "w", encoding="utf-8")

# Write embedding vectors and words to file
for num, word in enumerate(words_in_vocab):
  if num == 0:
     continue # skip padding token
  vec = embed_weights2[num]
  out_m.write(word + "\n") # write words to file
  out_v.write("\t".join([str(x) for x in vec]) + "\n") # write corresponding word vector to file
out_v.close()
out_m.close()

# # Download files locally to upload to Embedding Projector
# try:
#   from google.colab import files
# except ImportError:
#   pass
# else:
#   files.download("embedding_vectors.tsv")
#   files.download("embedding_metadata.tsv")