In [3]:
import re

import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Read the e-mail corpus into a DataFrame; later steps read its 'text' and
# 'target' columns.
DATASET_CSV = "Spam_Email_Data.csv"
data = pd.read_csv(DATASET_CSV)

# Data Preprocessing
# NLTK resources shared by every preprocess_text call; filled on first use so
# that defining the function does not itself require the NLTK corpora.
_NLP_CACHE = {}


def _nlp_resources():
    """Return the (stop-word set, lemmatizer) pair, building it once on first use."""
    if not _NLP_CACHE:
        _NLP_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _NLP_CACHE['stop_words'], _NLP_CACHE['lemmatizer']


def preprocess_text(text):
    """Turn one raw e-mail body into a list of cleaned, lemmatized tokens.

    Steps: lowercase, replace every character that is not an ASCII letter or
    whitespace with a space, tokenize with NLTK, drop English stop words, and
    lemmatize what is left.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example the
            float NaN pandas uses for an empty CSV cell) is treated as an
            empty document instead of raising ``AttributeError``.

    Returns:
        list[str]: The lemmatized tokens, in document order. Empty when the
        input is missing or contains no letters.
    """
    # Missing values reach this function as non-strings when it is used with
    # Series.apply; there is nothing to tokenize in that case.
    if not isinstance(text, str):
        return []

    # Convert text to lowercase
    text = text.lower()
    # Remove special characters and numbers using regex
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Only whitespace left: skip the tokenizer, the result is empty anyway.
    if not text.strip():
        return []

    # Building the stop-word set and the lemmatizer is identical for every
    # document, so it is done once and reused rather than repeated per call.
    stop_words, lemmatizer = _nlp_resources()

    # Tokenize, drop stop words, lemmatize.
    return [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]

# Run the cleaning pipeline over every row of the 'text' column; each entry of
# 'clean_text' is the token list returned by preprocess_text.
data['clean_text'] = data['text'].apply(preprocess_text)

# Fit Word2Vec on those token lists: 100-dimensional vectors, a context window
# of 5, every token kept (min_count=1), 4 worker threads.
# NOTE(review): with workers=4 the learned vectors may differ slightly between
# runs -- confirm against gensim's reproducibility notes if exact repeatability
# of the reported metrics matters.
word2vec_model = Word2Vec(
    sentences=data['clean_text'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Function to get document embeddings

def get_doc_embedding(tokens, model=None):
    """Average the word vectors of ``tokens`` into a single document vector.

    Args:
        tokens: Iterable of word strings (one preprocessed document).
        model: Optional trained model exposing a ``.wv`` keyed-vector store.
            When omitted, the module-level ``word2vec_model`` is used, so
            existing one-argument calls (e.g. via ``Series.apply``) behave as
            before.

    Returns:
        numpy.ndarray: 1-D array of length ``model.wv.vector_size`` holding the
        element-wise mean of the vectors of all in-vocabulary tokens, or an
        all-zero array of that length when no token is in the vocabulary.
    """
    wv = (word2vec_model if model is None else model).wv
    vectors = [wv[word] for word in tokens if word in wv]

    if not vectors:
        # Size the fallback from the model itself rather than a hard-coded 100,
        # and return an array (not a list) so both branches yield the same type.
        # float32 is the dtype gensim uses for its stored vectors.
        return np.zeros(wv.vector_size, dtype=np.float32)

    return np.mean(vectors, axis=0)

# Get document embeddings for all documents
data['doc_embedding'] = data['clean_text'].apply(get_doc_embedding)

# One seed for the split and for every estimator that has randomness, so a
# fresh "Restart & Run All" reproduces the same models and metrics.
RANDOM_STATE = 42

# Splitting the dataset into training and testing sets (80% / 20%).
# NOTE(review): word2vec_model was fitted on all of data['clean_text'] before
# this split, so the held-out e-mails also shaped the embedding space. The
# labels are not involved, but the scores below may be mildly optimistic --
# consider fitting the embeddings on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(
    data['doc_embedding'].to_list(),
    data['target'],
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Train Logistic Regression model
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train, y_train)

# Train Decision Tree model. Seeded because tree fitting permutes features at
# each split; without random_state the fitted tree can change between runs.
tree_model = DecisionTreeClassifier(random_state=RANDOM_STATE)
tree_model.fit(X_train, y_train)

# Evaluate models
log_reg_pred = log_reg_model.predict(X_test)
tree_pred = tree_model.predict(X_test)

# Same report for each model; headings are printed exactly as before.
for heading, predictions in (
    ("Logistic Regression:", log_reg_pred),
    ("Decision Tree:", tree_pred),
):
    print(heading)
    print(classification_report(y_test, predictions))


Logistic Regression:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       779
           1       1.00      0.98      0.99       381

    accuracy                           0.99      1160
   macro avg       0.99      0.99      0.99      1160
weighted avg       0.99      0.99      0.99      1160

Decision Tree:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98       779
           1       0.95      0.97      0.96       381

    accuracy                           0.97      1160
   macro avg       0.97      0.97      0.97      1160
weighted avg       0.97      0.97      0.97      1160



In [2]:
# Display the per-document embedding column that the pipeline cell stores on
# `data`.
# NOTE(review): this cell's execution count (2) is lower than the pipeline
# cell's (3) although it appears after it -- re-run top to bottom on a fresh
# kernel so the shown output matches the current code.
data['doc_embedding']

0       [-0.40194812, 0.6850408, -0.21529862, -0.34014...
1       [-0.14532085, 0.6477422, -0.10622812, 0.085225...
2       [-0.3384908, 0.74088675, 0.14835992, -0.384388...
3       [-0.5410258, 0.015514152, 0.017502993, -0.1372...
4       [-0.46610278, 0.8231711, -0.20361237, -0.66862...
                              ...                        
5791    [-0.6803774, 0.5955472, -0.0662018, -0.6343247...
5792    [-0.19682094, 0.72839934, 0.27339488, -0.44882...
5793    [-0.010830138, 0.33242434, -0.13595971, 0.0442...
5794    [-0.58353204, 0.17688018, 0.1941054, -0.979490...
5795    [-0.68748397, 0.25473619, 0.1906854, -0.859841...
Name: doc_embedding, Length: 5796, dtype: object