In [9]:
import re

import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Read the CSV file (path is relative to the working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer="Spam_Email_Data.csv")

# Data Preprocessing
# Lazily-built NLTK resources shared by every preprocess_text call.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Turn one raw document into a list of cleaned, lemmatized tokens.

    Steps, in order: lowercase the text, drop every character that is not an
    ASCII letter or whitespace, tokenize with ``word_tokenize``, discard
    English stop words, and lemmatize what is left with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (``None``, a float NaN from a
        missing CSV cell, ...) is treated as an empty document.

    Returns
    -------
    list of str
        The lemmatized tokens; an empty list for empty or non-string input.
    """
    # Missing cells are not strings; return no tokens instead of raising
    # AttributeError on `.lower()`.
    if not isinstance(text, str):
        return []

    # Convert text to lowercase
    text = text.lower()
    # Remove special characters and numbers using regex
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)

    # The stop-word set and the lemmatizer do not depend on `text`, so they are
    # built once and reused rather than re-created for every document.
    stop_words, lemmatizer = _get_text_resources()

    # Remove stopwords, then lemmatize the survivors
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

# Build the cleaned token list for every entry of the 'text' column
data['clean_text'] = data['text'].apply(preprocess_text)

# Fit Word2Vec on the token lists.
# NOTE(review): the model is fit on every row of `data`, including the rows
# that the later train_test_split call holds out as the test set.
word2vec_model = Word2Vec(
    sentences=data['clean_text'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Function to get document embeddings

def get_doc_embedding(tokens):
    """Average the Word2Vec vectors of a document's tokens.

    Tokens that are not in ``word2vec_model.wv`` are skipped.

    Parameters
    ----------
    tokens : iterable of str
        The document's preprocessed tokens.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``word2vec_model.wv.vector_size``: the element-wise
        mean of the known tokens' vectors, or all zeros when no token is known
        (including an empty document).
    """
    vectors = word2vec_model.wv
    word_embeddings = [vectors[word] for word in tokens if word in vectors]
    if not word_embeddings:
        # Size the fallback from the model rather than a literal 100, so it
        # keeps matching if the model's vector_size changes, and return an
        # ndarray like the non-empty branch does.
        return np.zeros(vectors.vector_size)
    return np.mean(word_embeddings, axis=0)

# Compute one embedding per document from its cleaned tokens
data['doc_embedding'] = data['clean_text'].apply(get_doc_embedding)

# Hold out 20% of the rows for testing (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    data['doc_embedding'].to_list(),
    data['target'],
    test_size=0.2,
    random_state=42,
)

# Train Logistic Regression model
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train, y_train)

# Train Decision Tree model.
# random_state is pinned (same value as the train/test split) because, per the
# scikit-learn documentation, features are randomly permuted at each split, so
# an unseeded tree -- and its reported metrics -- can differ between runs.
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train, y_train)

# Predict on the held-out rows with both fitted models
log_reg_pred = log_reg_model.predict(X_test)
tree_pred = tree_model.predict(X_test)

# Print a heading followed by the classification report for each model
for heading, predictions in (
    ("Logistic Regression:", log_reg_pred),
    ("Decision Tree:", tree_pred),
):
    print(heading)
    print(classification_report(y_test, predictions))


Logistic Regression:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       779
           1       1.00      0.99      0.99       381

    accuracy                           1.00      1160
   macro avg       1.00      0.99      1.00      1160
weighted avg       1.00      1.00      1.00      1160

Decision Tree:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       779
           1       0.98      0.96      0.97       381

    accuracy                           0.98      1160
   macro avg       0.98      0.97      0.98      1160
weighted avg       0.98      0.98      0.98      1160



In [5]:
# Display the per-document embedding column
data.loc[:, 'doc_embedding']


0       [-0.13293235, 0.5989273, -0.10993297, -0.51946...
1       [0.14040406, 0.62252957, -0.033200316, -0.1430...
2       [-0.11039918, 0.75404245, 0.31703705, -0.58244...
3       [-0.24569444, 0.13170093, 0.18354368, -0.09825...
4       [-0.19522151, 0.7204663, -0.07234885, -0.87194...
                              ...                        
5791    [-0.31374565, 0.52876866, 0.023394685, -0.7533...
5792    [-0.023130037, 0.80330676, 0.35306415, -0.6309...
5793    [0.03367258, 0.3827118, 0.058853943, 0.2070820...
5794    [-0.4109603, 0.25338852, 0.39777717, -1.117165...
5795    [-0.4162825, 0.46287644, 0.24279185, -1.114479...
Name: doc_embedding, Length: 5796, dtype: object