In [32]:
import re
import nltk
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [33]:
# Load dataset
data = pd.read_csv("Spam_Email_Data.csv")

In [34]:
# Display first 5 rows
data.head()

Unnamed: 0,text,target
0,From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...,0
1,From gort44@excite.com Mon Jun 24 17:54:21 200...,1
2,From fork-admin@xent.com Mon Jul 29 11:39:57 2...,1
3,From dcm123@btamail.net.cn Mon Jun 24 17:49:23...,1
4,From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...,0


In [35]:
# Display data info
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5796 entries, 0 to 5795
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5796 non-null   object
 1   target  5796 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 90.7+ KB


In [36]:
# Data preprocessing
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    #Text text cleansing
    # Remove Email header
    text =re.sub(r"^(From|To|Subject|Cc|Bcc|Date|Return-Path|Received|Message-ID):.*?\n","",text)

    # Remove HTML header
    text = re.sub(r"<.*?>", "", text)

     # Remove website URLs
    text = re.sub(r"http\S+", "", text)

    # Remove NoN-alphapectic
    text = re.sub(r'[^a-zA-Z\s]', "", text)


     # Remove dates
    text = re.sub(r"\b(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)\s(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s(?:[0-9]|[0-2][0-9]|3[0-1])\s(?:[0-9]{2})?(?:[0-9]{2})?\s(?:[0-1][0-9]|2[0-3]):(?:[0-5][0-9])(?::(?:[0-5][0-9]))?\s(?:\+[0-9]{4}|\-[0-9]{4}|\s[A-IK-Z])?\b", "", text)


    # Convert text to lowercase
    text = text.lower()
    # Tokenization
    tokens = word_tokenize(text)
    # Removing stopwords and stemming
    filtered_tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]
    # Joining tokens back into text
    preprocessed_text = ' '.join(filtered_tokens)
    return preprocessed_text

# Apply preprocessing to the 'Text' column
data['text'] = data['text'].apply(preprocess_text)


In [37]:
# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['target'], test_size=0.4, random_state=50)


In [38]:
# Model Training (Neural networks based)
import gensim
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument



In [39]:
# Build Word2Vec model
sentences=[row.split() for row in X_train]
word2vec_model = Word2Vec(sentences, vector_size=100)


In [40]:
# Word2Vec embedding for a text
def get_word2vec_embedding(text):
    words = text.split()
    embedding = np.mean([word2vec_model.wv[word] for word in words if word in word2vec_model.wv], axis=0)
    return embedding


In [41]:
import numpy as np

In [42]:
# Convert text data into Word2Vec and Doc2Vec embeddings
X_train_word2vec = np.array([get_word2vec_embedding(text) for text in X_train])
X_test_word2vec = np.array([get_word2vec_embedding(text) for text in X_test])


In [43]:
# Build Doc2Vec model
tagged_documents = [TaggedDocument(words=word_tokenize(text), tags=[i]) for i, text in enumerate(X_train)]
doc2vec_model = Doc2Vec(tagged_documents, vector_size=100)


In [44]:
# Doc2Vec embedding for a text
def get_doc2vec_embedding(text):
    words = text.split()
    embedding = doc2vec_model.infer_vector(words)
    return embedding

In [45]:
# Generate Doc2Vec embeddings for the training data
X_train_doc2vec = np.array([get_doc2vec_embedding(text) for text in X_train])

# Generate Doc2Vec embeddings for the testing data
X_test_doc2vec = np.array([get_doc2vec_embedding(text) for text in X_test])


In [46]:
# Train Decision Tree for word2vec
decision_tree_classifier_word2vec = DecisionTreeClassifier()
decision_tree_classifier_word2vec.fit(X_train_word2vec, y_train)
y_pred_decision_tree_word2vec=decision_tree_classifier_word2vec.predict(X_test_word2vec)

In [47]:
# Train Decision Tree for Doc2vec
decision_tree_classifier_doc2vec = DecisionTreeClassifier()
decision_tree_classifier_doc2vec.fit(X_train_doc2vec, y_train)
y_pred_decision_tree_doc2vec=decision_tree_classifier_doc2vec.predict(X_test_doc2vec)


In [48]:
# Train Logistic regression for word2vec
logistic_regression_classifier_word2vec = LogisticRegression()
logistic_regression_classifier_word2vec.fit(X_train_word2vec, y_train)
y_pred_logistic_regression_word2vec=logistic_regression_classifier_word2vec.predict(X_test_word2vec)


In [49]:
# Train Logistic regression for Dord2vec
logistic_regression_classifier_doc2vec = LogisticRegression()
logistic_regression_classifier_doc2vec.fit(X_train_doc2vec, y_train)
y_pred_logistic_regression_doc2vec=logistic_regression_classifier_doc2vec.predict(X_test_doc2vec)


In [50]:
#Non nueral text embedding
#Bag of word method
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer object
vectorizer = CountVectorizer()

# Fit the vectorizer to the corpus and transform the documents into BoW vectors
X_BOW = vectorizer.fit_transform(data['text'])

In [51]:
# Splitting the dataset
X_train_BOW, X_test_BOW, y_train_BOW, y_test_BOW = train_test_split(X_BOW, data['target'], test_size=0.4, random_state=50)

In [52]:
#TF-IDF method
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TfidfVectorizer object
vectorizer = TfidfVectorizer()

# Fit the vectorizer to the corpus and transform the documents into TF-IDF vectors
X_TFIDF = vectorizer.fit_transform(data['text'])


In [53]:
# Splitting the dataset
X_train_TFIDF, X_test_TFIDF, y_train_TFIDF, y_test_TFIDF = train_test_split(X_TFIDF, data['target'], test_size=0.4, random_state=50)

In [54]:
# Train Logistic regression for BOW
logistic_regression_classifier_BOW = LogisticRegression()
logistic_regression_classifier_BOW.fit(X_train_BOW, y_train_BOW)
y_pred_logistic_regression_BOW=logistic_regression_classifier_BOW.predict(X_test_BOW)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [55]:
# Train Logistic regression for TFIDF
logistic_regression_classifier_TFIDF = LogisticRegression()
logistic_regression_classifier_TFIDF.fit(X_train_TFIDF, y_train_TFIDF)
y_pred_logistic_regression_TFIDF=logistic_regression_classifier_TFIDF.predict(X_test_TFIDF)


In [56]:
# Train Decision Tree for BOW
decision_tree_classifier_BOW = DecisionTreeClassifier()
decision_tree_classifier_BOW.fit(X_train_BOW, y_train_BOW)
y_pred_decision_tree_BOW=decision_tree_classifier_BOW.predict(X_test_BOW)

In [57]:
# Train Decision Tree for TFIDF
decision_tree_classifier_TFIDF = DecisionTreeClassifier()
decision_tree_classifier_TFIDF.fit(X_train_TFIDF, y_train_TFIDF)
y_pred_decision_tree_TFIDF=decision_tree_classifier_TFIDF.predict(X_test_TFIDF)

In [58]:
result=[]
def model_test(model,y_test,y_pred):
    accuracy_test=accuracy_score(y_test,y_pred)
    precision_test=precision_score(y_test,y_pred)

    result.append({
        'Model':model,
        'Test accuracy': accuracy_test,
        'Test precision': precision_test
    })


In [59]:
model_test("logistic_regression_classifier_word2vec",y_test,y_pred_logistic_regression_word2vec)
model_test("y_pred_decision_tree_word2vec",y_test,y_pred_decision_tree_word2vec)
model_test("y_pred_logistic_regression_doc2vec",y_test,y_pred_logistic_regression_doc2vec)
model_test("decision_tree_classifier_doc2vec",y_test,y_pred_decision_tree_doc2vec)

model_test("logistic_regression_classifier_BOW",y_test_BOW,y_pred_logistic_regression_BOW)
model_test("decision_tree_classifier_BOW",y_test_BOW,y_pred_decision_tree_BOW)
model_test("logistic_regression_classifier_TFIDF",y_test_TFIDF,y_pred_logistic_regression_TFIDF)
model_test("decision_tree_classifier_TFIDF",y_test_TFIDF,y_pred_decision_tree_TFIDF)

In [60]:
models_result=pd.DataFrame(result)
print("\nModel Evaluation Results: \n")
print(models_result)



Model Evaluation Results: 

                                     Model  Test accuracy  Test precision
0  logistic_regression_classifier_word2vec       0.984045        0.987552
1            y_pred_decision_tree_word2vec       0.976283        0.964817
2       y_pred_logistic_regression_doc2vec       0.957309        0.943448
3         decision_tree_classifier_doc2vec       0.837861        0.729323
4       logistic_regression_classifier_BOW       0.994825        0.994580
5             decision_tree_classifier_BOW       0.978439        0.965054
6     logistic_regression_classifier_TFIDF       0.982320        0.992968
7           decision_tree_classifier_TFIDF       0.982320        0.970470


In [61]:
# Save dataframe to CSV file
models_result.to_csv('model_evaluation_results.csv', index=False)