In [2]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Load the dataset
data = pd.read_csv("Spam_Email_Data.csv")

# Data Preprocessing
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters and numbers using regex
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = re.sub(r'\S+@\S+', ' ', text)
    text = re.sub(r'<.*?>', ' ', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    return lemmatized_tokens

# Tokenize and preprocess the 'text' column
data['clean_text'] = data['text'].apply(preprocess_text)

# Create TaggedDocument objects
tagged_data = [TaggedDocument(words=text, tags=[str(i)]) for i, text in enumerate(data['clean_text'])]

# Train Doc2Vec model
doc2vec_model = Doc2Vec(vector_size=100, window=5, min_count=1, workers=4, epochs=20)
doc2vec_model.build_vocab(tagged_data)
doc2vec_model.train(tagged_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

# Function to get document embeddings
def get_doc_embedding(tokens):
    return doc2vec_model.infer_vector(tokens)

# Get document embeddings for all documents
# data['doc_embedding'] = data['clean_text'].apply(get_doc_embedding)

# Splitting the dataset into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(data['doc_embedding'].to_list(), data['target'], test_size=0.2, random_state=43)
X_train, X_test, y_train, y_test = train_test_split(data['clean_text'], data['target'], test_size=0.2, random_state=43)

X_train = X_train.apply(get_doc_embedding).to_list()
X_test = X_test.apply(get_doc_embedding).to_list()

# Train Logistic Regression model
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train, y_train)

# Train Decision Tree model
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)

# Evaluate models
log_reg_pred = log_reg_model.predict(X_test)
tree_pred = tree_model.predict(X_test)

print("Logistic Regression:")
print(classification_report(y_test, log_reg_pred))

print("Decision Tree:")
print(classification_report(y_test, tree_pred))


Logistic Regression:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       795
           1       0.98      0.97      0.97       365

    accuracy                           0.98      1160
   macro avg       0.98      0.98      0.98      1160
weighted avg       0.98      0.98      0.98      1160

Decision Tree:
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       795
           1       0.83      0.80      0.82       365

    accuracy                           0.89      1160
   macro avg       0.87      0.86      0.87      1160
weighted avg       0.89      0.89      0.89      1160

