In [25]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics import classification_report
# Load the dataset
data = pd.read_csv("Spam_Email_Data.csv")

# Data Preprocessing
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters and numbers using regex, removing emails and html tags
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = re.sub(r'\S+@\S+', ' ', text)
    text = re.sub(r'<.*?>', ' ', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    # Join tokens back into a single string
    # preprocessed_text = ' '.join(lemmatized_tokens)
    return lemmatized_tokens

# Apply preprocessing to the 'text' column
data['clean_text'] = data['text'].apply(preprocess_text)

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['clean_text'], data['target'], test_size=0.4, random_state=43)

data.head(10)


Unnamed: 0,text,target,clean_text
0,From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...,0,"[ilug, admin, linux, ie, mon, jul, return, pat..."
1,From gort44@excite.com Mon Jun 24 17:54:21 200...,1,"[gort, excite, com, mon, jun, return, path, go..."
2,From fork-admin@xent.com Mon Jul 29 11:39:57 2...,1,"[fork, admin, xent, com, mon, jul, return, pat..."
3,From dcm123@btamail.net.cn Mon Jun 24 17:49:23...,1,"[dcm, btamail, net, cn, mon, jun, return, path..."
4,From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...,0,"[ilug, admin, linux, ie, mon, aug, return, pat..."
5,From tobaccodemon@terra.es Sat Sep 7 22:05:58 ...,1,"[tobaccodemon, terra, e, sat, sep, return, pat..."
6,From larlar78@MailOps.Com Sat Jun 30 00:19:08 ...,1,"[larlar, mailops, com, sat, jun, return, path,..."
7,From rpm-list-admin@freshrpms.net Thu Jul 25 1...,0,"[rpm, list, admin, freshrpms, net, thu, jul, r..."
8,From exmh-users-admin@redhat.com Wed Aug 7 06:...,0,"[exmh, user, admin, redhat, com, wed, aug, ret..."
9,From contractor@goldenbay.com.cy Tue Jul 23 23...,1,"[contractor, goldenbay, com, cy, tue, jul, ret..."


In [26]:
print(data['text'][0])
print('----------------------------')
print(data['clean_text'][0])

----------------------------


In [28]:

# Create TaggedDocument objects
tagged_data = [TaggedDocument(words=text, tags=[str(i)]) for i, text in enumerate(data['clean_text'])]

# Train Doc2Vec model
doc2vec_model = Doc2Vec(vector_size=100, window=5, min_count=1, workers=4, epochs=20)
doc2vec_model.build_vocab(tagged_data)
doc2vec_model.train(tagged_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

# Define embedding techniques
embedding_techniques = {
    'Word2vec': Word2Vec(sentences=data['clean_text'], vector_size=100, window=5, min_count=1, workers=4),
    'doc2vec':doc2vec_model,
    'Bag of Words': CountVectorizer(),
    'TF-IDF': TfidfVectorizer()
    
}

# Define classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier()
}


# Function to get document embeddings for Word2vec
def get_doc_embedding_Word2vec(tokens):
    word_embeddings = [embedding_techniques['Word2vec'].wv[word] for word in tokens if word in embedding_techniques['Word2vec'].wv]
    if word_embeddings:
        doc_embedding = sum(word_embeddings) / len(word_embeddings)
    else:
        doc_embedding = [0] * 100  
    return doc_embedding

# Function to get document embeddings for doc2vec
def get_doc_embedding_doc2vec(tokens):
    return doc2vec_model.infer_vector(tokens)
    

#for Bag of Words and TF-IDF training this join the words of each email
X_train_Joining = [' '.join(row) for row in X_train]  # Joining rows
X_test_Joining = [' '.join(row) for row in X_test]  # Joining rows

results = []

# Loop through classifiers and embedding techniques
for clf_name, clf in classifiers.items():
    for vectorizer_name, vectorizer in embedding_techniques.items():
        if vectorizer_name == 'Word2vec':
            X_train_vectorized = X_train.apply(get_doc_embedding_Word2vec).to_list()
            X_test_vectorized = X_test.apply(get_doc_embedding_Word2vec).to_list()
        elif vectorizer_name == 'doc2vec':
            X_train_vectorized = X_train.apply(get_doc_embedding_doc2vec).to_list()
            X_test_vectorized = X_test.apply(get_doc_embedding_doc2vec).to_list()
        else :
        # Vectorize the data
            X_train_vectorized = vectorizer.fit_transform(X_train_Joining)
            X_test_vectorized = vectorizer.transform(X_test_Joining)
        
        # Training
        clf.fit(X_train_vectorized, y_train)
    
        # Predictions on test set
        predictions = clf.predict(X_test_vectorized)
        
        # Model evaluation
        accuracy = accuracy_score(y_test, predictions)
        f1 = f1_score(y_test, predictions, average='weighted')
        
        # Store results
        results.append({
            'Model': clf_name,
            'Embedding Technique': vectorizer_name,
            'Accuracy': accuracy,
            'F1-Score': f1
        })

# Create dataframe from results
results_df = pd.DataFrame(results)

# Print summary
print(results_df)

                 Model Embedding Technique  Accuracy  F1-Score
0  Logistic Regression            Word2vec  0.987495  0.987464
1  Logistic Regression             doc2vec  0.974558  0.974477
2  Logistic Regression        Bag of Words  0.996119  0.996117
3  Logistic Regression              TF-IDF  0.985339  0.985254
4        Decision Tree            Word2vec  0.974127  0.974183
5        Decision Tree             doc2vec  0.868047  0.868377
6        Decision Tree        Bag of Words  0.978008  0.977987
7        Decision Tree              TF-IDF  0.978439  0.978485
