# In [3]:
import re

import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Read the spam e-mail CSV into a DataFrame; the rest of the script reads its
# 'text' and 'target' columns.
DATASET_PATH = "Spam_Email_Data.csv"
data = pd.read_csv(DATASET_PATH)

# Data Preprocessing
# Compiled once at import time; preprocess_text applies them in this order.
_HTML_TAG_RE = re.compile(r'<[^>]*>')
_EMAIL_RE = re.compile(r'\S+@\S+')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

# Filled on first use so the NLTK resources are loaded once rather than once
# per document.
_NLTK_RESOURCES = {}


def _get_nltk_resources():
    """Return ``(stop_words, lemmatizer)``, creating both on the first call."""
    if not _NLTK_RESOURCES:
        _NLTK_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLTK_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLTK_RESOURCES['stop_words'], _NLTK_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalize one raw message into a space-joined string of lemmas.

    Steps, in order: lowercase, drop HTML tags, drop e-mail addresses,
    replace every remaining non-letter character with a space, tokenize,
    remove English stop words, lemmatize, and join with single spaces.

    Args:
        text: The raw message. Anything that is not a ``str`` (for example
            ``None`` or a NaN read from an empty CSV cell) is treated as an
            empty message.

    Returns:
        The cleaned text as a single string; ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Tags and addresses must go BEFORE the non-letter pass: that pass turns
    # '<', '>' and '@' into spaces, after which neither pattern can match.
    # Tags go first so an address inside an attribute does not swallow the
    # text that follows the tag. '[^>]*' also lets a tag span line breaks.
    text = _HTML_TAG_RE.sub(' ', text)
    text = _EMAIL_RE.sub(' ', text)
    # Remove special characters and numbers
    text = _NON_LETTER_RE.sub(' ', text)

    stop_words, lemmatizer = _get_nltk_resources()
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    )

# Clean every message and store the result alongside the raw 'text' column
data['clean_text'] = data['text'].apply(preprocess_text)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
features = data['clean_text']
labels = data['target']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=300,
)

# Tokenize each split once; the same token lists feed both training and inference
train_tokens = [word_tokenize(doc.lower()) for doc in X_train]
test_tokens = [word_tokenize(doc.lower()) for doc in X_test]

# Tagging documents for Doc2Vec (tag = position of the document in X_train)
tagged_data = [TaggedDocument(words=tokens, tags=[str(i)]) for i, tokens in enumerate(train_tokens)]

# Train Doc2Vec model
max_epochs = 100
vec_size = 20
alpha = 0.025
model = Doc2Vec(
    vector_size=vec_size,
    alpha=alpha,
    min_alpha=0.00025,
    min_count=1,
    dm=1,
    epochs=max_epochs,
)
model.build_vocab(tagged_data)
# Call train() exactly once and let gensim decay the learning rate from
# `alpha` to `min_alpha` over `max_epochs` passes. Calling train() inside a
# Python loop with epochs=model.epochs runs max_epochs * model.epochs passes
# over the corpus, and hand-editing model.alpha / model.min_alpha leaves a
# flat learning rate behind that infer_vector() then picks up as its default.
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Generate document vectors
X_train_vec = [model.infer_vector(tokens) for tokens in train_tokens]
X_test_vec = [model.infer_vector(tokens) for tokens in test_tokens]

# Logistic regression: fit on the training vectors, predict the held-out set,
# then score the predictions (macro-averaged F1)
lr_model = LogisticRegression()
lr_model.fit(X_train_vec, y_train)
lr_pred = lr_model.predict(X_test_vec)

# Decision tree: same fit / predict sequence
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_vec, y_train)
dt_pred = dt_model.predict(X_test_vec)

# Score both sets of predictions against the true test labels
lr_accuracy, lr_f1 = (
    accuracy_score(y_test, lr_pred),
    f1_score(y_test, lr_pred, average='macro'),
)
dt_accuracy, dt_f1 = (
    accuracy_score(y_test, dt_pred),
    f1_score(y_test, dt_pred, average='macro'),
)

# Report one section per model, in the order they were trained
for heading, accuracy, f1 in (
    ("Logistic Regression Model:", lr_accuracy, lr_f1),
    ("\nDecision Tree Model:", dt_accuracy, dt_f1),
):
    print(heading)
    print("Accuracy:", accuracy)
    print("F1 Score:", f1)


# Notebook output: KeyboardInterrupt (the cell was interrupted while running)