<a href="https://colab.research.google.com/github/PifDaReal/jedha-deep-leaning/blob/main/spam_detector_gcolab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importing necessary libraries for EDA
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import graph_objects as go

# Importing libraries necessary for Model Building and Training
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the SMS dataset, keeping only the label column (v1) and the message text (v2).
datas = pd.read_csv(
    "https://full-stack-bigdata-datasets.s3.eu-west-3.amazonaws.com/Deep+Learning/project/spam.csv",
    usecols=["v1", "v2"],
    encoding="latin1",
)

In [None]:
# Preview the first rows of the freshly loaded frame (columns v1 and v2).
datas.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Binary target: 1 when the label is "spam", 0 for anything else.
datas["v1_indice"] = [int(label == "spam") for label in datas["v1"]]
datas

Unnamed: 0,v1,v2,v1_indice
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will Ì_ b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [None]:
# Download the small English spaCy pipeline (quiet mode) so it can be imported as a package.
!python -m spacy download en_core_web_sm -q

2023-11-13 00:41:42.488845: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-13 00:41:42.488922: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-13 00:41:42.488963: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
# The downloaded spaCy model is importable as a regular Python package.
import en_core_web_sm
# spaCy's English stop-word collection.
from spacy.lang.en.stop_words import STOP_WORDS

# Instantiate the pipeline once and reuse it for every message.
nlp = en_core_web_sm.load()

In [None]:
# Text cleaning pipeline. Each step works on the result of the previous one
# (column "v2_clean"); the raw text in "v2" is left untouched.

# Step 1: keep only alphanumeric characters and spaces.
datas["v2_clean"] = datas["v2"].apply(
    lambda text: "".join(ch for ch in text if ch.isalnum() or ch == " ")
)

# Step 2: lowercase, collapse repeated spaces and trim both ends.
# str.split() with no argument drops empty chunks, so joining with a single
# space performs the collapse and the strip in one go.
datas["v2_clean"] = datas["v2_clean"].apply(
    lambda text: " ".join(text.lower().split())
)

# Step 3: replace every word with its lemma and drop stop words
# (checked on both the lemma and the surface form).
datas["v2_clean"] = datas["v2_clean"].apply(
    lambda text: " ".join(
        token.lemma_
        for token in nlp(text)
        if token.lemma_ not in STOP_WORDS and token.text not in STOP_WORDS
    )
)

In [None]:
import re

# Replace every run of punctuation characters with a single space.
# A raw string is used so that the regex escapes (\[ \] \\) reach the regex
# engine unchanged instead of being treated as (invalid) Python string escapes.
datas["v2_clean"] = datas["v2_clean"].apply(
    lambda text: re.sub(r'[!"#$%&()*+,-./:;<=>?@\[\]^_`{|}~\\]+', " ", text)
)

datas

Unnamed: 0,v1,v2,v1_indice,v2_clean
0,ham,"Go until jurong point, crazy.. Available only ...",0,jurong point crazy available bugis n great...
1,ham,Ok lar... Joking wif u oni...,0,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry 2 wkly comp win FA Cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,0,u dun early hor u c
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,nah I think usf live
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1,2nd time try 2 contact u U win å£750 Pound pr...
5568,ham,Will Ì_ b going to esplanade fr home?,0,Ì b esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",0,pity mood suggestion
5570,ham,The guy did some bitching but I acted like i'd...,0,guy bitching I act like interested buy week free


In [None]:
# Create the tokenizer; num_words caps how many words are kept when texts
# are encoded (only the most common ones survive).
tokenizer_v2 = Tokenizer(num_words=1000)

# Fitting lists every unique token found in the cleaned texts and assigns
# each of them an integer index.
tokenizer_v2.fit_on_texts(datas["v2_clean"])

# Turn every cleaned message into its sequence of word indices.
datas["v2_encoded"] = tokenizer_v2.texts_to_sequences(datas["v2_clean"])

# Preprocessing can strip every word from a message (for example when it only
# contained stop words), so record each sequence length and discard the
# records whose encoded sequence is empty.
datas["len_v2"] = datas["v2_encoded"].apply(len)
datas = datas.loc[datas["len_v2"] != 0]

In [None]:
# Inspect the first rows, now including the cleaned text, encoded sequence and its length.
datas.head()

Unnamed: 0,v1,v2,v1_indice,v2_clean,v2_encoded,len_v2
0,ham,"Go until jurong point, crazy.. Available only ...",0,jurong point crazy available bugis n great...,"[287, 477, 453, 31, 55, 207, 75, 60]",8
1,ham,Ok lar... Joking wif u oni...,0,ok lar joke wif u oni,"[9, 217, 569, 314, 2]",5
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry 2 wkly comp win FA Cup final tkts 2...,"[12, 322, 3, 570, 666, 43, 922, 454, 923, 19, ...",19
3,ham,U dun say so early hor... U c already then say...,0,u dun early hor u c,"[2, 133, 167, 2, 44]",5
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,nah I think usf live,"[773, 1, 21, 715, 141]",5


In [None]:
# Pad every encoded message with trailing zeros ("post") so all rows share the same length.
datas_pad = pad_sequences(datas["v2_encoded"], padding="post")

datas_pad

array([[287, 477, 453, ...,   0,   0,   0],
       [  9, 217, 569, ...,   0,   0,   0],
       [ 12, 322,   3, ...,   0,   0,   0],
       ...,
       [ 49,  99, 627, ...,   0,   0,   0],
       [120,   1,  15, ...,   0,   0,   0],
       [319,   0,   0, ...,   0,   0,   0]], dtype=int32)

In [None]:
# Hold out 30% of the padded sequences (and their labels) for validation;
# the fixed random_state keeps the split reproducible.
X_train, X_val, Y_train, Y_val = train_test_split(
    datas_pad,
    datas["v1_indice"],
    random_state=42,
    test_size=0.3,
)


In [None]:
# Pair the encoded texts with their labels in tensorflow datasets, then
# shuffle each dataset over its full length and organize it in batches of 32.
train_ds = (
    tf.data.Dataset.from_tensor_slices((X_train, Y_train))
    .shuffle(len(X_train))
    .batch(32)
)
val_ds = (
    tf.data.Dataset.from_tensor_slices((X_val, Y_val))
    .shuffle(len(X_val))
    .batch(32)
)

In [None]:
# Build the model: embedding -> average over the sequence -> small dense head.
model = tf.keras.models.Sequential()
# The tokenizer was created with a `num_words` cap, so the encoded sequences
# only contain indices below that cap (0 is the padding value). Sizing the
# embedding table from the full `word_index` would allocate rows that can
# never be looked up, so `num_words` is used as the vocabulary size instead.
model.add(tf.keras.layers.Embedding(input_dim=tokenizer_v2.num_words, output_dim=32))
# Collapse the sequence dimension by averaging the word embeddings.
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(16, activation='relu'))
# Single sigmoid unit: the model outputs a spam probability.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Print the model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          246592    
                                                                 
 global_average_pooling1d (  (None, 32)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 16)                528       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 247137 (965.38 KB)
Trainable params: 247137 (965.38 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# The output layer already applies a sigmoid, so the loss receives
# probabilities, not raw logits: from_logits must be False. With True the
# loss would apply a second sigmoid to the predictions.
model.compile(loss = tf.keras.losses.BinaryCrossentropy(from_logits = False),
              metrics = ['accuracy'],
              optimizer = 'adam')

In [None]:
# Train the model.
# Both datasets are already batched (see `.batch(32)` when they were built),
# so `batch_size` must not be passed to `fit` when the input is a tf.data.Dataset.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Palette shared by the training curves.
color_chart = ["#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6", "#2A7FAF", "#23B1AB", "#0E3449", "#015955"]

# History object attached to the model; holds the per-epoch metrics.
history = model.history

# One line trace per metric: training loss first, validation loss second.
fig = go.Figure(data=[
    go.Scatter(
        y=history.history[metric_key],
        name=trace_name,
        mode="lines",
        marker={"color": trace_color},
    )
    for metric_key, trace_name, trace_color in (
        ("loss", "Training loss", color_chart[0]),
        ("val_loss", "Validation loss", color_chart[1]),
    )
])
fig.update_layout(
    title='Training and val loss across epochs',
    xaxis_title='epochs',
    yaxis_title='Cross Entropy',
)
fig.show()

In [None]:
# Palette shared by the training curves.
color_chart = ["#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6", "#2A7FAF", "#23B1AB", "#0E3449", "#015955"]

# History object attached to the model; holds the per-epoch metrics.
history = model.history

# One line trace per metric: training accuracy first, validation accuracy second.
fig = go.Figure(data=[
    go.Scatter(
        y=history.history[metric_key],
        name=trace_name,
        mode="lines",
        marker={"color": trace_color},
    )
    for metric_key, trace_name, trace_color in (
        ("accuracy", "Training accuracy", color_chart[0]),
        ("val_accuracy", "Validation accuracy", color_chart[1]),
    )
])
fig.update_layout(
    title='Training and val accuracy across epochs',
    xaxis_title='epochs',
    yaxis_title='Accuracy',
)
fig.show()