In [21]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier


# Load the dataset
data = pd.read_csv("Spam_Email_Data.csv")

# Data Preprocessing
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters and numbers using regex
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = re.sub(r'\S+@\S+', ' ', text)
    text = re.sub(r'<.*?>', ' ', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    # Join tokens back into a single string
    preprocessed_text = ' '.join(lemmatized_tokens)
    return preprocessed_text

# Apply preprocessing to the 'text' column
data['clean_text'] = data['text'].apply(preprocess_text)

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['clean_text'], data['target'], test_size=0.2, random_state=43)

data.head(10)


Unnamed: 0,text,target,clean_text
0,From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...,0,ilug admin linux ie mon jul return path ilug a...
1,From gort44@excite.com Mon Jun 24 17:54:21 200...,1,gort excite com mon jun return path gort excit...
2,From fork-admin@xent.com Mon Jul 29 11:39:57 2...,1,fork admin xent com mon jul return path fork a...
3,From dcm123@btamail.net.cn Mon Jun 24 17:49:23...,1,dcm btamail net cn mon jun return path dcm bta...
4,From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...,0,ilug admin linux ie mon aug return path ilug a...
5,From tobaccodemon@terra.es Sat Sep 7 22:05:58 ...,1,tobaccodemon terra e sat sep return path tobac...
6,From larlar78@MailOps.Com Sat Jun 30 00:19:08 ...,1,larlar mailops com sat jun return path larlar ...
7,From rpm-list-admin@freshrpms.net Thu Jul 25 1...,0,rpm list admin freshrpms net thu jul return pa...
8,From exmh-users-admin@redhat.com Wed Aug 7 06:...,0,exmh user admin redhat com wed aug return path...
9,From contractor@goldenbay.com.cy Tue Jul 23 23...,1,contractor goldenbay com cy tue jul return pat...


In [22]:
print(data['text'][0])
print('----------------------------')
print(data['clean_text'][0])

----------------------------


In [23]:
from gensim.models import Word2Vec
# Define classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier()
}

# Define embedding techniques
embedding_techniques1 = {
    'Bag of Words': CountVectorizer(),
    'TF-IDF': TfidfVectorizer(),
    'Word2vec': Word2Vec(sentences=data['clean_text'], vector_size=100, window=5, min_count=1, workers=4)
    
}

def get_doc_embedding(tokens):
    word_embeddings = [embedding_techniques1['Word2vec'].wv[word] for word in tokens if word in embedding_techniques1['Word2vec'].wv]
    if word_embeddings:
        doc_embedding = sum(word_embeddings) / len(word_embeddings)
    else:
        doc_embedding = [0] * 100  # If no word found in the vocabulary, return zero vector
    return doc_embedding

results = []
# Loop through classifiers and embedding techniques
for clf_name, clf in classifiers.items():
    for vectorizer_name, vectorizer in embedding_techniques1.items():
        if vectorizer_name == 'Word2vec':
            X_train_vectorized = X_train.apply(get_doc_embedding).to_list()
            X_test_vectorized = X_test.apply(get_doc_embedding).to_list()
        else:
        # Vectorize the data
            X_train_vectorized = vectorizer.fit_transform(X_train)
            X_test_vectorized = vectorizer.transform(X_test)
        
        # Training
        clf.fit(X_train_vectorized, y_train)
        # Predictions on test set
        predictions = clf.predict(X_test_vectorized)
        
        # Model evaluation
        accuracy = accuracy_score(y_test, predictions)
        f1 = f1_score(y_test, predictions, average='weighted')
        
        # Store results
        results.append({
            'Model': clf_name,
            'Embedding Technique': vectorizer_name,
            'Accuracy': accuracy,
            'F1-Score': f1
        })

# Create dataframe from results
results_df = pd.DataFrame(results)

# Print summary
print(results_df)

                 Model Embedding Technique  Accuracy  F1-Score
0  Logistic Regression        Bag of Words  0.994828  0.994824
1  Logistic Regression              TF-IDF  0.984483  0.984390
2  Logistic Regression            Word2vec  0.868966  0.863179
3        Decision Tree        Bag of Words  0.979310  0.979296
4        Decision Tree              TF-IDF  0.971552  0.971521
5        Decision Tree            Word2vec  0.908621  0.908423
