**Name: Rana Mohamed Ali**


**ID: 5**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from keras.preprocessing.sequence import pad_sequences

# Loading the dataset

In [None]:
# Read the labelled e-mail corpus from the CSV next to the notebook.
data = pd.read_csv('emails.csv')
# Preview the first rows to confirm the `text` and `spam` columns loaded.
data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


# Exploring the dataset

In [None]:
# Dimensions of the raw frame as (rows, columns).
data.shape

(5728, 2)

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB


In [None]:
# Summary statistics of the numeric columns (here only the `spam` label);
# the mean of a 0/1 label is the fraction of spam e-mails.
data.describe()

Unnamed: 0,spam
count,5728.0
mean,0.238827
std,0.426404
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


# Preprocessing of the data

**cleaning the data**

In [None]:
# Count rows that exactly repeat an earlier row (every column equal).
data.duplicated().sum()

33

In [None]:
# Drop repeated rows in place, keeping the first occurrence of each.
# The index is not reset, so the original row labels are kept with gaps.
data.drop_duplicates(inplace=True)
# Re-count to confirm that no duplicates remain.
data.duplicated().sum()

0

In [None]:
# Dimensions of the frame after de-duplication.
data.shape

(5695, 2)

In [None]:
# Number of missing values in each column.
data.isnull().sum()

Unnamed: 0,0
text,0
spam,0


**Text processing**

In [None]:
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used by the preprocessing below.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer tables used by word_tokenize from
# the `punkt_tab` package rather than `punkt`; without it word_tokenize
# raises LookupError on a fresh environment.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
stemmer = PorterStemmer()
# Built once instead of once per e-mail: neither the stop-word set nor the
# punctuation table depends on the text being processed.
STOP_WORDS = frozenset(stopwords.words('english'))
PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def text_processing(text):
    """Turn one raw e-mail into a list of cleaned, stemmed tokens.

    The text is lowercased, stripped of ASCII punctuation, tokenized with
    NLTK's ``word_tokenize``, filtered against the English stop-word list
    and finally reduced to stems with the Porter stemmer.

    Args:
        text: Raw e-mail text as a string.

    Returns:
        List of stemmed tokens, in the order they appear in the e-mail.
    """
    # lowercase and remove punctuation
    cleaned_text = text.lower().translate(PUNCTUATION_TABLE)
    # tokenize, drop stop words and stem in a single pass
    return [
        stemmer.stem(word)
        for word in word_tokenize(cleaned_text)
        if word not in STOP_WORDS
    ]

In [None]:
# Clean, tokenize and stem every e-mail; each cell of the new column holds
# the list of tokens returned by text_processing.
data['processed_emails'] = data['text'].apply(text_processing)

In [None]:
# Show the token list of the e-mail whose row label is 1.
print(data.loc[1, 'processed_emails'])

['subject', 'stock', 'trade', 'gunsling', 'fanni', 'merril', 'muzo', 'colza', 'attaind', 'penultim', 'like', 'esmark', 'perspicu', 'rambl', 'segovia', 'group', 'tri', 'slung', 'kansa', 'tanzania', 'ye', 'chameleon', 'continu', 'clothesman', 'libretto', 'chesapeak', 'tight', 'waterway', 'herald', 'hawthorn', 'like', 'chisel', 'morristown', 'superior', 'deoxyribonucl', 'clockwork', 'tri', 'hall', 'incred', 'mcdougal', 'ye', 'hepburn', 'einsteinian', 'earmark', 'sapl', 'boar', 'duan', 'plain', 'palfrey', 'inflex', 'like', 'huzzah', 'pepperoni', 'bedtim', 'nameabl', 'attir', 'tri', 'edt', 'chronographi', 'optima', 'ye', 'pirogu', 'diffus', 'albeit']


In [None]:
# Preview the frame after text processing: the new `processed_emails`
# column sits next to the raw `text` it was derived from.
data.head()

Unnamed: 0,text,spam,processed_emails
0,Subject: naturally irresistible your corporate...,1,"[subject, natur, irresist, corpor, ident, lt, ..."
1,Subject: the stock trading gunslinger fanny i...,1,"[subject, stock, trade, gunsling, fanni, merri..."
2,Subject: unbelievable new homes made easy im ...,1,"[subject, unbeliev, new, home, made, easi, im,..."
3,Subject: 4 color printing special request add...,1,"[subject, 4, color, print, special, request, a..."
4,"Subject: do not have money , get software cds ...",1,"[subject, money, get, softwar, cd, softwar, co..."


# Feature Extraction

**Bag Of Words**

In [None]:
# Cap the vocabulary at the 10000 most frequent terms.
max_features = 10000
# Maximum e-mail length (in tokens) for the sequence model input.
max_len = 500
vectorizer = CountVectorizer(max_features=max_features)
# CountVectorizer expects strings, so each token list is re-joined with
# spaces before the bag-of-words counts are computed.
joined_emails = data['processed_emails'].map(' '.join)
# x: dense matrix with one row per e-mail and one count column per term.
x = vectorizer.fit_transform(joined_emails).toarray()
# y: spam labels (1 = spam, 0 = not spam).
y = data['spam']
# NOTE(review): the vocabulary is learned from the whole corpus, i.e. before
# the train/test split further down -- confirm this is acceptable.

**Word Embedding (using word2vec model)**

In [None]:
def word_embedding(tokens, model, size):
    """
    Average the Word2Vec vectors of the tokens of one sentence.

    tokens: list of words; model: object whose ``wv`` attribute maps a word
    to its vector; size: length of the embedding vectors.
    Tokens missing from ``model.wv`` are skipped. If none of the tokens is
    known (or the list is empty) a zero vector of length ``size`` is returned.
    """
    keyed_vectors = model.wv
    # keep only the vectors of words the model knows about
    known_vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors]

    # no usable word at all -> fall back to the zero vector
    if not known_vectors:
        return np.zeros(size)

    # element-wise mean over all collected word vectors
    return np.mean(known_vectors, axis=0)



In [None]:
from gensim.models import Word2Vec

# Learn word vectors from the tokenised e-mails themselves.
Word2Vec_model = Word2Vec(
    sentences=data['processed_emails'],
    vector_size=128,  # length of the vector representing each word
    min_count=1,      # keep words even if they occur only once
)

# One averaged embedding vector per e-mail, stored in the `WE` column.
data['WE'] = data['processed_emails'].apply(
    lambda tokens: word_embedding(tokens, Word2Vec_model, Word2Vec_model.vector_size)
)

In [None]:
# Stack the per-e-mail embedding vectors into one 2-D array (one row each).
X = np.vstack(data['WE'].to_numpy())

# Labels, row-aligned with X.
y = data['spam']
# NOTE(review): none of the later cells visible here reads `X`; the models
# below are fed the lowercase bag-of-words `x` -- confirm that is intended.

# Model Implementation

**Naive Bayes**

In [None]:
# implement the Naive Bayes classifier
# Created with default hyper-parameters; it is fitted later, after the
# train/test split.
nb_model = MultinomialNB()

**LSTM**

In [None]:
# lstm model implementation
# Three stacked layers: token-id embedding -> LSTM -> single sigmoid unit.
lstm_model = Sequential([
    # maps each token id (vocab size = 10000) to a 100-dimensional vector
    Embedding(input_dim=max_features, output_dim=100, input_length=max_len),
    # summarises the embedded sequence into a 128-dimensional state
    LSTM(128),
    # sigmoid output for binary classification -> spam or not
    Dense(1, activation='sigmoid'),
])



In [None]:
# compile the model
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy
# is tracked as the reported metric.
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model Evaluation

**Train-test data split**

In [None]:
# Hold out 20% of the e-mails for testing; the fixed seed keeps the split
# repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# The bag-of-words matrices are already fixed-width (one count column per
# vocabulary term), so they must not go through pad_sequences: with
# maxlen=max_len it keeps only the last 500 of the 10000 count columns, and
# Naive Bayes would be trained and evaluated on that truncated slice.
# NOTE(review): the LSTM needs padded sequences of token ids built from the
# tokenised e-mails, not these count vectors.
assert x_train.shape[1] == len(vectorizer.vocabulary_), "train matrix lost vocabulary columns"
assert x_test.shape[1] == x_train.shape[1], "train/test feature widths differ"

**Train Naive Bayes**

In [None]:
# Train the Naive Bayes model on the training split only; the test split
# stays unseen until evaluation.
nb_model.fit(x_train, y_train)

**Evaluate Naive Bayes**

In [None]:
# Predict labels for both splits so training and test scores can be compared.
y_train_pred = nb_model.predict(x_train)
y_test_pred = nb_model.predict(x_test)

In [None]:
# Score Naive Bayes on both splits first, then report training before testing.
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("Naive Bayes accuracy during training:", train_accuracy)
print("Naive Bayes accuracy during testing:", test_accuracy)

Naive Bayes accuracy during training: 0.8939859525899913
Naive Bayes accuracy during testing: 0.884108867427568


**Testing A sample**

In [None]:
# Compare the true label with the Naive Bayes prediction for two test e-mails
# (positions 0 and 19 of the test split).
for sample_position in (0, 19):
    actual_label = 'Not Spam' if y_test.iloc[sample_position] == 0 else 'Spam'
    predicted_label = 'Not Spam' if y_test_pred[sample_position] == 0 else 'Spam'
    print(f"Email: {actual_label}")
    print(f"Prediction: {predicted_label}\n")

Email: Not Spam
Prediction: Not Spam

Email: Spam
Prediction: Spam



**Train LSTM**

In [None]:
# Train the LSTM model
# The Embedding layer looks up one vector per integer token id, so the LSTM
# must receive each e-mail as an ordered sequence of vocabulary ids padded to
# max_len -- word-count rows are not valid input for it.
# Ids are the CountVectorizer column indices shifted by one so that 0 stays
# reserved for padding; a term whose shifted id would reach max_features is
# treated as out-of-vocabulary to stay inside the Embedding's input_dim.
token_to_id = {
    term: column + 1
    for term, column in vectorizer.vocabulary_.items()
    if column + 1 < max_features
}


def emails_to_padded_ids(token_lists):
    """Map tokenised e-mails to an (n_emails, max_len) array of token ids.

    Tokens absent from ``token_to_id`` are skipped; shorter e-mails are
    padded with 0 and longer ones truncated by ``pad_sequences``.
    """
    id_sequences = [
        [token_to_id[token] for token in tokens if token in token_to_id]
        for tokens in token_lists
    ]
    return pad_sequences(id_sequences, maxlen=max_len)


# y_train / y_test keep the row labels of `data`, so selecting by their index
# yields the e-mails of each split in matching order. The results are bound
# to x_train / x_test because the LSTM prediction and evaluation cells read
# those names.
x_train = emails_to_padded_ids(data.loc[y_train.index, 'processed_emails'])
x_test = emails_to_padded_ids(data.loc[y_test.index, 'processed_emails'])

lstm_model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 493ms/step - accuracy: 0.7398 - loss: 0.5703 - val_accuracy: 0.7752 - val_loss: 0.5329
Epoch 2/10
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 474ms/step - accuracy: 0.7529 - loss: 0.5614 - val_accuracy: 0.7752 - val_loss: 0.5370
Epoch 3/10
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 473ms/step - accuracy: 0.7596 - loss: 0.5542 - val_accuracy: 0.7752 - val_loss: 0.5411
Epoch 4/10
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 481ms/step - accuracy: 0.7567 - loss: 0.5563 - val_accuracy: 0.7752 - val_loss: 0.5360
Epoch 5/10
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 484ms/step - accuracy: 0.7643 - loss: 0.5472 - val_accuracy: 0.7752 - val_loss: 0.5323
Epoch 6/10
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 484ms/step - accuracy: 0.7663 - loss: 0.5424 - val_accuracy: 0.7752 - val_loss: 0.5374
Epoch 7/10

<keras.src.callbacks.history.History at 0x7e5439db1e40>

**Evaluating LSTM**

In [None]:
# Predict spam probabilities for the test split and threshold them at 0.5
# to obtain 0/1 labels.
y_test_lstm_pred = (lstm_model.predict(x_test) > 0.5).astype("int32")

[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 155ms/step


In [None]:
# evaluate lstm model
# evaluate() returns the loss followed by the metric set at compile time
# (accuracy), both computed on the test split.
lstm_loss,lstm_acc = lstm_model.evaluate(x_test,y_test)
print("LSTM test accuracy: ",lstm_acc)

[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 126ms/step - accuracy: 0.7329 - loss: 0.5644
LSTM test accuracy:  0.74012291431427


**Evaluation metrics**

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# function to evaluate the model
def evaluate(y_true, y_pred):
    """Print accuracy, precision, recall and F1 score of a set of predictions.

    y_true: ground-truth labels; y_pred: predicted labels.
    Nothing is returned; the four scores are printed one per line.
    """
    metric_functions = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )
    # compute every score before printing so the four lines appear together
    scores = [(label, metric(y_true, y_pred)) for label, metric in metric_functions]
    for label, value in scores:
        print(label, value)


In [None]:
# evaluating naive bayes with metrics
# Scores the test-split predictions made by nb_model.
print("Naive Bayes metrics:")
evaluate(y_test, y_test_pred)

Naive Bayes metrics:
Accuracy: 0.884108867427568
Precision: 0.8203125
Recall: 0.7094594594594594
F1 Score: 0.7608695652173914


In [None]:
# evaluating LSTM
# Uses the thresholded 0/1 predictions, not the raw sigmoid probabilities.
print("LSTM metrics:")
evaluate(y_test, y_test_lstm_pred)


LSTM metrics:
Accuracy: 0.7401229148375769
Precision: 0.5
Recall: 0.010135135135135136
F1 Score: 0.019867549668874177


**Naive Bayes:**

**Pros:**


*   High Accuracy
*   Good precision and recall; it works efficiently when the features are conditionally independent given the class.

**Cons:**

*   Assumes the features are independent of one another, which might not always be the case.


---


**LSTM:**

**Pros:**


*   Suitable when the data contains long-range contextual information.

**Cons:**
*   Lower accuracy and a much lower recall than Naive Bayes in this experiment.


